What we learned from a related "Minimal Superstring" problem
2
3
4
GACGGCGGCGCACGGCGCAA - Our toy sequence from 2 lectures ago GACGG CGCAC ACGGC GCACG CGGCG CACGG - The complete set of 16 5-mers GGCGG ACGGC GCGGC CGGCG CGGCG GGCGC GGCGC GCGCA GGCGA CGCAA
5
def kmersUnique(seq, k):
kmers = sorted([seq[i:i+k] for i in xrange(len(seq)-k+1)])
for i in xrange(1,len(kmers)):
if (kmers[i] == kmers[i-1][0:k]):
t = kmers[i-1].find('_')
if (t >= 0):
n = int(kmers[i-1][t+1:]) + 1
kmers[i] = kmers[i] + "_" + str(n)
else:
kmers[i-1] = kmers[i-1] + "_1"
kmers[i] = kmers[i] + "_2"
return kmers
kmers = kmersUnique("GACGGCGGCGCACGGCGCAA", 5)
print kmers
import itertools
class Graph:
def __init__(self, vlist=[]):
""" Initialize a Graph with an optional vertex list """
self.index = {v:i for i,v in enumerate(vlist)}
self.vertex = {i:v for i,v in enumerate(vlist)}
self.edge = []
self.edgelabel = []
def addVertex(self, label):
""" Add a labeled vertex to the graph """
index = len(self.index)
self.index[label] = index
self.vertex[index] = label
def addEdge(self, vsrc, vdst, label='', repeats=True):
""" Add a directed edge to the graph, with an optional label.
Repeated edges are distinct, unless repeats is set to False. """
e = (self.index[vsrc], self.index[vdst])
if (repeats) or (e not in self.edge):
self.edge.append(e)
self.edgelabel.append(label)
def hamiltonianPath(self):
""" A Brute-force method for finding a Hamiltonian Path.
Basically, all possible N! paths are enumerated and checked
for edges. Since edges can be reused there are no distictions
made for *which* version of a repeated edge. """
for path in itertools.permutations(sorted(self.index.values())):
for i in xrange(len(path)-1):
if ((path[i],path[i+1]) not in self.edge):
break
else:
return [self.vertex[i] for i in path]
return []
def SearchTree(self, path, verticesLeft):
""" A recursive Branch-and-Bound Hamiltonian Path search.
Paths are extended one node at a time using only available
edges from the graph. """
if (len(verticesLeft) == 0):
self.PathV2result = [self.vertex[i] for i in path]
return True
for v in verticesLeft:
if (len(path) == 0) or ((path[-1],v) in self.edge):
if self.SearchTree(path+[v], [r for r in verticesLeft if r != v]):
return True
return False
def hamiltonianPathV2(self):
""" A wrapper function for invoking the Branch-and-Bound
Hamiltonian Path search. """
self.PathV2result = []
self.SearchTree([],sorted(self.index.values()))
return self.PathV2result
def degrees(self):
""" Returns two dictionaries with the inDegree and outDegree
of each node from the graph. """
inDegree = {}
outDegree = {}
for src, dst in self.edge:
outDegree[src] = outDegree.get(src, 0) + 1
inDegree[dst] = inDegree.get(dst, 0) + 1
return inDegree, outDegree
def verifyAndGetStart(self):
inDegree, outDegree = self.degrees()
start = 0
end = 0
for vert in self.vertex.iterkeys():
ins = inDegree.get(vert,0)
outs = outDegree.get(vert,0)
if (ins == outs):
continue
elif (ins - outs == 1):
end = vert
elif (outs - ins == 1):
start = vert
else:
start, end = -1, -1
break
if (start >= 0) and (end >= 0):
return start
else:
return -1
def eulerianPath(self):
graph = [(src,dst) for src,dst in self.edge]
currentVertex = self.verifyAndGetStart()
path = [currentVertex]
# "next" is where vertices get inserted into our tour
# it starts at the end (i.e. it is the same as appending),
# but later "side-trips" will insert in the middle
next = 1
while len(graph) > 0:
for edge in graph:
if (edge[0] == currentVertex):
currentVertex = edge[1]
graph.remove(edge)
path.insert(next, currentVertex)
next += 1
break
else:
for edge in graph:
try:
next = path.index(edge[0]) + 1
currentVertex = edge[0]
break
except ValueError:
continue
else:
print "There is no path!"
return False
return path
def eulerEdges(self, path):
edgeId = {}
for i in xrange(len(self.edge)):
edgeId[self.edge[i]] = edgeId.get(self.edge[i], []) + [i]
edgeList = []
for i in xrange(len(path)-1):
edgeList.append(self.edgelabel[edgeId[path[i],path[i+1]].pop()])
return edgeList
def render(self, highlightPath=[]):
""" Outputs a version of the graph that can be rendered
using graphviz tools (http://www.graphviz.org/)."""
edgeId = {}
for i in xrange(len(self.edge)):
edgeId[self.edge[i]] = edgeId.get(self.edge[i], []) + [i]
edgeSet = set()
for i in xrange(len(highlightPath)-1):
src = self.index[highlightPath[i]]
dst = self.index[highlightPath[i+1]]
edgeSet.add(edgeId[src,dst].pop())
result = ''
result += 'digraph {\n'
result += ' graph [nodesep=2, size="10,10"];\n'
for index, label in self.vertex.iteritems():
result += ' N%d [shape="box", style="rounded", label="%s"];\n' % (index, label)
for i, e in enumerate(self.edge):
src, dst = e
result += ' N%d -> N%d' % (src, dst)
label = self.edgelabel[i]
if (len(label) > 0):
if (i in edgeSet):
result += ' [label="%s", penwidth=3.0]' % (label)
else:
result += ' [label="%s"]' % (label)
elif (i in edgeSet):
result += ' [penwidth=3.0]'
result += ';\n'
result += ' overlap=false;\n'
result += '}\n'
return result
6
k = 5
target = "GACGGCGGCGCACGGCGCAA"
kmers = kmersUnique(target, k)
G1 = Graph(kmers)
for vsrc in kmers:
for vdst in kmers:
if (vsrc[1:k] == vdst[0:k-1]):
G1.addEdge(vsrc,vdst)
path = G1.hamiltonianPathV2()
print path
seq = path[0][0:k]
for kmer in path[1:]:
seq += kmer[k-1]
print seq
print seq == target
7
How do they differ?
There might be even more solutions
Genome assembly is not as unambiguous as the Minimal Substring problem
9
k = 5
target = "GACGGCGGCGCACGGCGCAA"
kmers = kmersUnique(target, k)
print kmers
nodes = sorted(set([code[:k-1] for code in kmers] + [code[1:k] for code in kmers]))
print nodes
G2 = Graph(nodes)
for code in kmers:
G2.addEdge(code[:k-1],code[1:k],code)
path = G2.eulerianPath()
print path
path = G2.eulerEdges(path)
print path
seq = path[0][0:k]
for kmer in path[1:]:
seq += kmer[k-1]
print seq
print seq == target
10
k = 8
target = "GACGGCGGCGCACGGCGCAA"
kmers = kmersUnique(target, k)
print kmers
nodes = sorted(set([code[:k-1] for code in kmers] + [code[1:k] for code in kmers]))
print nodes
G3 = Graph(nodes)
for code in kmers:
G3.addEdge(code[:k-1],code[1:k],code)
path = G3.eulerianPath()
print path
path = G3.eulerEdges(path)
print path
seq = path[0][0:k]
for kmer in path[1:]:
seq += kmer[k-1]
print seq
print seq == target
12
k = 8
target = "GACGGCGGCGCACGGCGCAA"
kmers = kmersUnique(target, k)
G4 = Graph(kmers)
for vsrc in kmers:
for vdst in kmers:
if (vsrc[1:k] == vdst[0:k-1]):
G4.addEdge(vsrc,vdst)
path = G4.hamiltonianPathV2()
print path
seq = path[0][0:k]
for kmer in path[1:]:
seq += kmer[k-1]
print seq
print seq == target
13
k = 5
target = "GACGGCGGCGCACGGCGCAA"
kmers = set([target[i:i+k] for i in xrange(len(target)-k+1)])
nodes = sorted(set([code[:k-1] for code in kmers] + [code[1:k] for code in kmers]))
G5 = Graph(nodes)
for code in kmers:
G5.addEdge(code[:k-1],code[1:k],code)
14