import string import array import sys class Edge: count = 0 def __init__(self, dstNode, first, last): self.dstNode = dstNode self.first = first self.last = last Edge.count += 1 def split(self, suffix, suffixTree): # add new explicit node newIndex = len(suffixTree.nodes) suffixTree.nodes.append(0) # add new suffix edge newFirst = self.first + len(suffix) suffixTree.edgeLookup[newIndex, suffixTree.string[newFirst]] = Edge(self.dstNode, newFirst, self.last) # shorten this edge self.last = newFirst - 1 self.dstNode = newIndex return newIndex def isLeafEdge(self, suffixTree): return (self.last == suffixTree.lastIndex) def __len__(self): return self.last - self.first + 1 def __repr__(self): return "Edge(%d, %d, %d)" % (self.dstNode, self.first, self.last) class Suffix: def __init__(self, srcNode, first, last): self.srcNode = srcNode self.first = first self.last = last def is_explicit(self): return self.first > self.last def is_implicit(self): return self.first <= self.last def canonicalize(self, suffixTree): if self.is_implicit(): edge = suffixTree.edgeLookup[self.srcNode, suffixTree.string[self.first]] if (len(edge) <= len(self)): self.first += len(edge) self.srcNode = edge.dstNode self.canonicalize(suffixTree) def __len__(self): return self.last - self.first + 1 def __repr__(self): return "Suffix(%d, %d, %d)" % (self.srcNode, self.first, self.last) class SuffixTree: def __init__(self, string): self.string = string # save a pointer to the string self.alphabet = set() # alphabet of string self.nodes = array.array('l') # index of ith node's parent self.nodes.append(0) # add root node self.edgeLookup = {} # adjacency list indexed by (srcNode, char) self.lastIndex = len(string) - 1 activePoint = Suffix(0, 0, -1) for i in xrange(len(string)): self.alphabet.add(string[i]) self.addPrefix(i, activePoint) def addPrefix(self, last, activePoint): LastParentNode = -1 while True: ParentNode = activePoint.srcNode if activePoint.is_explicit(): if (activePoint.srcNode, self.string[last]) in self.edgeLookup: break else: #potentially split an implicit node edge = self.edgeLookup[activePoint.srcNode, self.string[activePoint.first]] if (self.string[edge.first + len(activePoint)] == self.string[last]): break else: ParentNode = edge.split(activePoint, self) self.nodes.append(-1) self.edgeLookup[ParentNode, self.string[last]] = Edge(len(self.nodes)-1, last, self.lastIndex) # add suffix link if (LastParentNode > 0): self.nodes[LastParentNode] = ParentNode LastParentNode = ParentNode if (activePoint.srcNode == 0): activePoint.first += 1 else: activePoint.srcNode = self.nodes[activePoint.srcNode] activePoint.canonicalize(self) if (LastParentNode > 0): self.nodes[LastParentNode] = ParentNode activePoint.last += 1 activePoint.canonicalize(self) def leafIndices(self, nodeIndex=0, lenSoFar=0): indexList = [] if (self.nodes[nodeIndex] < 0): indexList.append(self.lastIndex + 1 - lenSoFar) else: for char in self.alphabet: try: edge = self.edgeLookup[nodeIndex, char] if edge.isLeafEdge(self): indexList.append(self.lastIndex + 1 - len(edge) - lenSoFar) else: indexList += self.leafIndices(edge.dstNode, lenSoFar + len(edge)) except KeyError: continue return indexList def distinct(self, nodeIndex=0, lenSoFar=0): distinctList = [] # examine children of node for char in self.alphabet: try: edge = self.edgeLookup[nodeIndex, char] if edge.isLeafEdge(self): distinctList.append((edge.first-lenSoFar, lenSoFar+1)) else: distinctList += self.distinct(edge.dstNode, lenSoFar + len(edge)) except: continue return distinctList def thread(self, target): nodeIndex = 0 # start from root i = 0 # characters threaded so far while (i < len(target)): try: edge = self.edgeLookup[nodeIndex, target[i]] prefix = self.string[edge.first:edge.last+1] if (target[i:].startswith(prefix) or prefix.startswith(target[i:])): i += len(prefix) else: return [] nodeIndex = edge.dstNode except KeyError: return [] return self.leafIndices(nodeIndex, i) def printTree(self, nodeIndex=0, prefixLength=0, prefix=""): branches = list() for c in self.alphabet: try: edge = self.edgeLookup[nodeIndex, c] extent = prefix + "(%d)-%s-" % (nodeIndex, self.string[edge.first:edge.last+1]) subtree = self.printTree(edge.dstNode, prefixLength+len(edge), extent) if (len(subtree) > 0): branches.append(subtree) elif (edge.isLeafEdge(self)): print extent + "(%d)[%d]" % (edge.dstNode, edge.first-prefixLength) except KeyError: pass return branches if __name__ == "__main__": if (len(sys.argv) > 1): test = sys.argv[1]+"$" if (len(sys.argv) > 2): target = sys.argv[2] else: target = "beef" else: test = "imissmissmississippiskiss$" target = "iss" rule = "".join([str(i)[-1] for i in xrange(len(test))]) i = 0 while (i < len(rule)): print rule[i:min(i+50,len(rule))] print test[i:min(i+50,len(rule))] i += 50 suffixTree = SuffixTree(test) suffixTree.printTree() print "%s appears at: %s" % (target, sorted(suffixTree.thread(target)))