In [41]:
import gzip

def loadChromosome(filename):
    """Return the sequence stored in a gzip-compressed, single-record FASTA file.

    The first line of the file is treated as the FASTA header and discarded;
    all remaining lines are concatenated with their line endings removed.

    Args:
        filename: path of the gzip-compressed FASTA file.

    Returns:
        The sequence as one native string ('' if the file has no sequence lines).
    """
    # The context manager closes the file even if read() raises.
    with gzip.open(filename, 'rb') as fp:
        raw = fp.read()
    # Binary mode yields bytes on Python 3; normalise to the native str type so
    # callers get the same kind of object on Python 2 and Python 3.
    if not isinstance(raw, str):
        raw = raw.decode('ascii')
    # splitlines() copes with a missing final newline and with CRLF endings,
    # so no sequence line is lost and no '\r' leaks into the result.
    lines = raw.splitlines()
    return ''.join(lines[1:])

# Read the gzipped mitochondrial FASTA and terminate the text with the '$'
# end-of-text marker, then show its length plus the first 30 / last 31 symbols.
seq = loadChromosome("HumChrMT.fa.gz") + '$'
print(len(seq))
print(seq[:30] + " ... " + seq[-31:])
16570
GATCACAGGTCTATCACCCTATTAACCACT ... CACGTTCCCCTTAAATAAGACATCACGATG$

In [42]:
def linesOf64(s):
    """Print s in consecutive rows of 64 characters.

    The final row holds whatever remains (1 to 64 characters); nothing is
    printed for an empty s.

    Args:
        s: the string (or other sliceable sequence) to print.
    """
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    for start in range(0, len(s), 64):
        print(s[start:start + 64])

# Dump the whole '$'-terminated sequence, 64 symbols per row.
print("Seq = ")
linesOf64(seq)
Seq = 
GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTC
TGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATC
TGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACAT
ACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGC
ACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTC
TGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTA
ACCAGATTTCAAATTTTATCTTTTGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACA
TTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCCGCCCATCCTACCCA
GCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCCACA
GTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCC
ATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCAT
CCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCA
GCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTT
AGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCA
CCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCC
TCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACG
AAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCAC
TATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCA
CAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATC
GATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCC
TGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCAT
GAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTATGAAACTTA
AGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAG
CGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTA
CGCATTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAA
CCAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCG
CTCTGAGCTAAACCTAGCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCAT
TTACCCAAATAAAGTATAGGCGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGG
GAAAGATGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCTGCATAA
TGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAACCAGACGAGCT
ACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTA
GAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTT
TAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTCCAAAGAGGAACA
GCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAGTAGGCC
TAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAAC
ATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAG
TATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGAC
AATTAACAGCCCAATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACA
CAGGCATGCTCATAAGGAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTG
TTTACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATG
TTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTTCCTTAAATAGGGAC
CTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCC
CGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAATGC
AAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGG
GCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCGAA
CTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGC
AATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCC
CGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTT
CAGACCGGAGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAA
GAGAAATAAGGCCTACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATT
ATACCCACACCCACCCAAGAACAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACT
TAAAACTTTACAGTCAGAGGTTCAATTCCTCTTCTTAACAACATACCCATGGCCAACCTCCTAC
TCCTCATTGTACCCATTCTAATCGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGG
CTATATACAACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCCTTCGCT
GACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCACATCTACCATCACCCTCTACA
TCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCCCTCCCCATACCCAA
CCCCCTGGTCAACCTCAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTAC
TCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGCAG
TAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAG
TGGCTCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCA
TGACCCTTGGCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACC
TTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGC
CCTATTCTTCATAGCCGAATACACAAACATTATTATAATAAACACCCTCACCACTACAATCTTC
CTAGGAACAACATATGACGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCC
TACTTCTAACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGCTACGACCAACT
CATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATTACTTATATGATATGTCTCC
ATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTAC
TTTGATAGAGTAAATAATAGGAGCTTAAACCCCCTTATTTCTAGGACTATGAGAATCGAACCCA
TCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTCAGCTAA
ATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCT
GGCCCAACCCGTCATCTACTCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCAC
TGATTTTTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAA
AAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCAT
AATCCTTCTAATAGCTATCCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACT
ACCAATCAATACTCATCATTAATAATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCT
TTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGGCCTGCTTCTTCTCAC
ATGACAAAAACTAGCCCCCATCTCAATCATATACCAAATCTCTCCCTCACTAAACGTAAGCCTT
CTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGC
TACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTCTACCGTA
CAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCCTAACTACTACCGCATTCCTA
CTACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTAACAT
GACTAACACCCTTAATTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTT
TTTGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATCATCCCCACCATC
ATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCACCTCAATCA
CACTACTCCCCATATCTAACAACGTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCC
ATTCCTCCCCACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATA
ATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATA
CTTAATTTCTGTAACAGCTAAGGACTGCAAAACCCCACTCTGCATCAACTGAACGCAAATCAGC
CACTTTAATTAAGCTAAGCCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAAC
AGCTAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAAAAAGGCGGGAG
AAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAAATCACCTCGGAG
CTGGTAAAAAGAGGCCTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTT
ACCTCACCCCCACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGGAA
CACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTCTAAGCCTCCTTATTCG
AGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCACATCTACAACGTTATCGTCACA
GCCCATGCATTTGTAATAATCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACT
GACTAGTTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAACATAAGCTT
CTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTGCTATAGTGGAGGCCGGAGCAGGA
ACAGGTTGAACAGTCTACCCTCCCTTAGCAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACC
TAACCATCTTCTCCTTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCAC
AACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCCCTCTTCGTCTGATCC
GTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAGCTGCTGGCATCACTATAC
TACTAACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCT
ATACCAACACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCAGGCTTC
GGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTA
TGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTAC
AGTAGGAATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCC
ACCGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGAAATGATCTGCTG
CAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCACCGTAGGTGGCCTGACTGGCATTGTATT
AGCAAACTCATCACTAGACATCGTACTACACGACACGTACTACGTTGTAGCCCACTTCCACTAT
GTCCTATCAATAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATTTCCCCTATTCT
CAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCACTATCATATTCATCGGCGTAAA
TCTAACTTTCTTCCCACAACACTTTCTCGGCCTATCCGGAATGCCCCGACGTTACTCGGACTAC
CCCGATGCATACACCACATGAAACATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAG
TAATATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCCTAATAGTAGA
AGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGAAGAA
CCCGTATACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCC
AACCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTCAA
AGTTAAATTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAG
ACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTT
CCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACT
AACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCC
TCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTAC
CATCAAATCAATTGGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATC
TTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTG
ACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTT
GCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAACCAA
ACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAA
ACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGT
ATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAAC
CTTTTAAGTTAAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTA
CCGTATGGCCCACCATAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAAT
ATTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAA
CCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCT
ACCCGCCGCAGTACTGATCATTCTATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATC
AACAACCGACTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATAACCA
TACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTTAATCATTTTTATTGCCAC
AACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTATCTATAAACCTA
GCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATG
CCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGA
AACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCA
GGCCACCTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCT
CTACACTTATCATCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAAT
CCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAACACATAATGACCCACC
AATCACATGCCTATCATATAGTAAAACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCT
CCTAATGACCTCCGGCCTAGCCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGC
CTACTAACCAACACACTAACCATATACCAATGATGGCGCGATGTAACACGAGAAAGCACATACC
AAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTC
AGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCC
CAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTCC
TAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAATAGA
AAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACC
CTCCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAA
CATTTTTTGTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTAT
CTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCC
GCCTGATACTGGCATTTTGTAGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAG
GGTCTTACTCTTTTAGTATAAATAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCA
AAAAAGAGTAATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGCCTTACTACTAATA
ATTATTACATTTTGACTACCACAACTCAACGGCTACATAGAAAAATCCACCCCTTACGAGTGCG
GCTTCGACCCTATATCCCCCGCCCGCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTAC
CTTCTTATTATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAACAACT
AACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCT
ATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAGTTTAAACAAAACGAATGA
TTTCGACTCATTAAATTATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATA
CTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTAC
TATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCA
CTCCCTCTTAGCCAATATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTG
GGCCTAGCCCTACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACC
TACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGACTTTCCAAA
AAACACATAATTTGAATCAACACAACCACCCACAGCCTAATTATTAGCATCATCCCTCTACTAT
TTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCCAACCTTTTCCTCCGACCCCCTAAC
AACCCCCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGCCAACGCCAC
TTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCTATACTAATCTCCCTACAAATCT
CCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAACCACACT
TATCCCCACCTTGGCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA
TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACA
ACACCCTAGGCTCACTAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTG
AGCCAACAACTTAATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGA
CTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCG
CAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGAC
AAAACACATAGCCTACCCCTTCCTTGTACTATCCCTATGAGGCATAATTATAACAAGCTCCATC
TGCCTACGACAAACAGACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCG
TAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCATTCTCATAATCGC
CCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTACGAACGCACTCACAGT
CGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTC
TAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTGTGCT
AGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACA
GCCCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACA
ACATAAAACCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCT
CCTATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACA
TCAGATTGTGAATCTGACAACAGAGGCTTACGACCCCTTATTTACCGAGAAAGCTCACAAGAAC
TGCTAACTCATGCCCCCATGTCTAACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATC
CATTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACACTA
CTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAA
CAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTC
TTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCA
CAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATATTCATCCC
TGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACCCA
AACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCG
CTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCAT
CAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGT
ATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACC
CACAACAAATAGCCCTTCTAAACGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGC
AGCAGCAGGCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCC
ACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCC
GCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTAT
CACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTC
TCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCAT
TCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCAT
CCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTC
ACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCT
ACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTAT
TACTCTCATCGCTACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACA
GGTCAACCTCGCTTCCCCACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCA
TTAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGC
ATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTCACAGCCCTCGCTGTCACTTTCCTA
GGACTTCTAACAGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCCCCACTAT
GCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCCTA
TCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCTAACCTGACTAGAAAAG
CTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCAACCCAAAAAG
GCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCAC
ATAACCTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTTCAACCAGT
AACTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATC
AACCCTGACCCCTCTCCTTCATAAATTATTCAGCTTCCTACACTATTAAAGTTTACCACAACCA
CCACCCCATCATACTCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAAC
ACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCGCTGTA
GTATATCCAAAGACAACCATCATTCCCCCTAAATAAATTAAAAAAACTATTAAACCCATATAAC
CTCCCCCAAAATTCAGAATAATAACACACCCGACCACACCGCTAACAATCAATACTAAACCCCC
ATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTAAACCCACACTCAACAGA
AACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGACCAATGATATGAAAAACCATC
GTTGTATTTCAACTACAAGAACACCAATGACCCCAATACGCAAAACTAACCCCCTAATAAAATT
AATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGC
TCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTACT
CACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTG
AATCATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATC
GGGCGAGGCCTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCC
TGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTG
AGGGGCCACAGTAATTACAAACTTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAA
TGAATCTGAGGAGGCTACTCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCA
TCTTGCCCTTCATTATTGCAGCCCTAGCAACACTCCACCTCCTATTCTTGCACGAAACGGGATC
AAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATCACCTTCCACCCTTACTACACAATC
AAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACATTAACACTATTCTCACCAG
ACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACATCAA
GCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCTAACAAACTAGGAGGC
GTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCCCCATCCTCCATATATCCAAAC
AACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCT
CATTCTAACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGACAAGTAGCA
TCCGTACTATACTTCACAACAATCCTAATCCTAATACCAACTATCTCCCTAATTGAAAACAAAA
TACTCAAATGGGCCTGTCCTTGTAGTATAAACTAATACACCAGTCTTGTAAACCGGAGATGAAA
ACCTTTTTCCAAGGACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAG
ATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATT
GACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTAC
GGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCC
CATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCAACTGCAACTCCAAAGC
CACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGC
CATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAG
ATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCT
CGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACT
TCAGGGTCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATG$

In [73]:
import numpy
import sys

# BWT.resolveTies calls itself once for every further 3-symbol window it needs
# to separate tied rotations, so raise the interpreter's recursion ceiling.
sys.setrecursionlimit(100000)

class BWT:
    """Suffix array, Burrows-Wheeler transform and FM-index over "$ACGNT".

    Cyclic rotations of the input are ordered by packing every three
    consecutive symbols into one base-6 number, bucketing on that number, and
    recursively refining each bucket with the following three symbols.

    Attributes:
        mix3: uint8 array; mix3[k] packs the ranks of the symbols at positions
            k, k+1, k+2 (mod n) as 36*a + 6*b + c.
        suffixArray: int64 array of rotation start positions in sorted order.
        maxlevel: largest symbol offset tie resolution had to look at.
        verbose: when > 0, suffixSort prints each first-level bucket and size.
        BWT: cached transform string ('' until getBWT is first called).
        FMindex: cached dict built by getFMindex (None until first called).
    """

    # Symbols in sort order; a symbol's position is its rank (base-6 digit).
    ALPHABET = "$ACGNT"

    def __init__(self, seq, verbose=0):
        """Encode seq and sort its rotations.

        Args:
            seq: integer numpy array of character codes (the notebook passes
                uint8 ASCII codes). Codes must be below 128; any code that is
                not one of ALPHABET's symbols is given rank 0, the same as '$'.
            verbose: pass a value > 0 to print the first-level bucket sizes.
        """
        cmap = numpy.zeros(128, dtype='uint8')
        for rank, symbol in enumerate(self.ALPHABET):
            cmap[ord(symbol)] = rank
        ranks = cmap[seq]
        # Pack each cyclic triple of ranks; the largest possible value is
        # 36*5 + 6*5 + 5 = 215, so the result always fits in uint8.
        packed = 36*ranks + 6*numpy.roll(ranks, -1) + numpy.roll(ranks, -2)
        self.mix3 = packed.astype('uint8')
        self.maxlevel = 0
        self.verbose = verbose
        self.BWT = ''
        self.FMindex = None
        self.suffixSort()

    def trio(self, val):
        """Decode a packed value (0..215) back into its three-symbol string."""
        symbols = []
        for _ in range(3):
            val, digit = divmod(val, 6)
            symbols.append(self.ALPHABET[digit])
        # Digits come out least-significant first; the string reads the other way.
        return ''.join(reversed(symbols))

    def resolveTies(self, ties, offset):
        """Order rotations that agree on their first `offset` symbols.

        Args:
            ties: array of rotation start positions that are still tied.
            offset: number of leading symbols already known to be equal.

        Returns:
            An array with the same positions, sorted by the symbols from
            `offset` onwards. Positions that stay tied keep their input order.
        """
        if (offset > self.maxlevel):
            self.maxlevel = offset
        total = len(self.mix3)
        # Once a whole cycle has been compared, the remaining rotations are
        # identical (the text has no unique terminator); stop instead of
        # recursing forever.
        if (offset >= total):
            return ties
        next3 = self.mix3[(ties + offset) % total]
        counts = numpy.bincount(next3)
        result = numpy.empty_like(ties)
        start = 0
        for value, n in enumerate(counts):
            if (n == 0):
                continue
            end = start + n
            # Boolean selection preserves the relative order of `ties`.
            result[start:end] = ties[next3 == value]
            if (n > 1):
                result[start:end] = self.resolveTies(result[start:end], offset + 3)
            start = end
        return result

    def suffixSort(self):
        """Fill self.suffixArray with the rotation start positions in sorted order."""
        counts = numpy.bincount(self.mix3)
        self.suffixArray = numpy.empty(len(self.mix3), dtype='int64')
        start = 0
        for value, n in enumerate(counts):
            if (n == 0):
                continue
            end = start + n
            self.suffixArray[start:end] = numpy.flatnonzero(self.mix3 == value)
            if (self.verbose > 0):
                # Same text as the Python 2 statement `print trio, n`.
                print("%s %d" % (self.trio(value), n))
            if (n > 1):
                self.suffixArray[start:end] = self.resolveTies(self.suffixArray[start:end], 3)
            start = end

    def getBWT(self):
        """Return the Burrows-Wheeler transform string, computing it on first use.

        Row r of the transform is the symbol that cyclically precedes the
        rotation starting at suffixArray[r].
        """
        if (self.BWT == ''):
            total = len(self.mix3)
            predecessors = (self.suffixArray - 1) % total
            # The leading base-6 digit of a packed triple is the symbol itself.
            codes = self.mix3[predecessors] // 36
            self.BWT = ''.join([self.ALPHABET[code] for code in codes.tolist()])
        return self.BWT

    def getFMindex(self):
        """Return the FM-index, building it on first use.

        Returns:
            dict mapping each symbol c present in the transform to a uint32
            array of length n+1 whose entry i is (number of transform symbols
            that sort before c) + (occurrences of c in the first i rows).
        """
        if (self.FMindex is None):
            s = self.getBWT()
            symbols = numpy.array(list(s))
            index = dict()
            offset = 0
            for key in sorted(set(s)):
                column = numpy.zeros(len(s) + 1, dtype="uint32")
                # Running occurrence count of `key` in the transform.
                column[1:] = numpy.cumsum(symbols == key)
                # Shift by the number of symbols that sort before `key`.
                column += offset
                offset = column[-1]
                index[key] = column
            self.FMindex = index
        return self.FMindex

    def find(self, pattern):
        """Locate pattern by backward search.

        Args:
            pattern: string of symbols to look for.

        Returns:
            (L, U): half-open range of suffixArray rows whose rotations start
            with pattern; the pattern is absent when L == U. An empty pattern
            gives (0, n). A pattern containing a symbol that does not occur in
            the text gives the empty range (0, 0). The bounds may be numpy
            uint32 scalars, so convert with int() before subtracting.
        """
        FM = self.getFMindex()
        L, U = 0, len(self.suffixArray)
        for a in reversed(pattern):
            if a not in FM:
                return 0, 0
            L = FM[a][L]
            U = FM[a][U]
        return L, U

    def suffix(self, j):
        """Rebuild the text as seen from sorted row j.

        Walks backwards through the transform from row j until the walk
        returns to row j, prepending one symbol per step.

        Args:
            j: row number in the sorted order (0 <= j < n).

        Returns:
            The string of symbols collected on that walk; for a text with a
            unique terminator this is the full rotation that sorts to row j.
        """
        FM = self.getFMindex()
        transform = self.getBWT()
        result = ''
        i = j
        while True:
            c = transform[i]
            # Jump to the row whose rotation starts with this occurrence of c.
            i = FM[c][i]
            result = c + result
            if (i == j):
                break
        return result
In [74]:
# Turn the ASCII text into a uint8 array of character codes for BWT.
# numpy.frombuffer replaces the deprecated binary mode of numpy.fromstring;
# .copy() keeps the array writable, as fromstring's result was.
nSeq = numpy.frombuffer(seq.encode('ascii'), dtype='uint8').copy()
print(len(nSeq))

bwt = BWT(nSeq, verbose=1)
# Formatted so the output matches the Python 2 statement `print label, value`.
print("%s %s" % ("max recursion level = ", bwt.maxlevel))
16570
$GA 1
AAA 524
AAC 495
AAG 209
AAT 376
ACA 448
ACC 515
ACG 119
ACN 1
ACT 412
AGA 178
AGC 282
AGG 174
AGT 161
ATA 367
ATC 371
ATG 162
ATT 330
CAA 465
CAC 454
CAG 199
CAT 416
CCA 464
CCC 624
CCG 141
CCT 542
CGA 122
CGC 155
CGG 80
CGT 78
CNT 1
CTA 523
CTC 419
CTG 180
CTT 318
G$G 1
GAA 201
GAC 169
GAG 129
GAT 114
GCA 207
GCC 271
GCG 54
GCT 179
GGA 122
GGC 151
GGG 72
GGT 80
GTA 154
GTC 106
GTG 55
GTT 104
NTT 1
TAA 414
TAC 377
TAG 258
TAT 324
TCA 415
TCC 361
TCG 121
TCT 307
TG$ 1
TGA 190
TGC 123
TGG 99
TGT 100
TTA 329
TTC 308
TTG 116
TTT 251
max recursion level =  15

In [75]:
# Shorthand for the sorted rotation start positions held by the BWT object.
suffixArray = bwt.suffixArray

def space(trips):
    """Return trips split into groups of three characters joined by spaces.

    The last group is shorter when len(trips) is not a multiple of three;
    an empty input gives an empty string.
    """
    groups = []
    for start in range(0, len(trips), 3):
        # Slicing already stops at the end of the string, so no explicit clamp.
        groups.append(trips[start:start + 3])
    return ' '.join(groups)

# Spot-check the suffix array: for every 100th row print the row number, the
# rotation's start position, its first 18 symbols (wrapping past the end of
# seq), and whether that prefix is >= the prefix of the previous sampled row.
prev = ''
for j in range(0, len(nSeq), 100):
    i = suffixArray[j]
    # Symbols that would fall off the end of seq are taken from its beginning.
    overhang = max(0, i + 18 - len(seq))
    window = seq[i:i+18] + seq[:overhang]
    print("%9d %9d %-24s %s" % (j, i, space(window), window >= prev))
    # Keep the wrapped prefix so the next row is compared with exactly the
    # string that was printed, not with the shorter unwrapped tail.
    prev = window
        0     16569 $GA TCA CAG GTC TAT CAC  True
      100      2131 AAA ACC TTG TAG AGA GAG  True
      200     14026 AAA CAA TTT CAC AGC ACC  True
      300     15900 AAA CTA ATA CAC CAG TCT  True
      400      1293 AAA GTA AGC GCA AGT ACC  True
      500       612 AAA TGT TTA GAC GGG CTC  True
      600       335 AAC ACA TCT CTG CCA AAC  True
      700      8122 AAC CAA ACC ACT TTC ACC  True
      800      3490 AAC CCG CCA CAT CTA CCA  True
      900      2563 AAC GGC CGC GGT ACC CTA  True
     1000     11335 AAC TTA ATA TGA CTA GCT  True
     1100     13342 AAG CCA TAC TAT TTA TGT  True
     1200      1739 AAG TAT AGG CGA TAG AAA  True
     1300     14448 AAT AGC CAT CGC TGT AGT  True
     1400     13151 AAT CCA AAC TCT AAC ACT  True
     1500      8168 AAT GCT CTG AAA TCT GTG  True
     1600     10072 AAT TTT AAT AAT CAA CAC  True
     1700     12216 ACA AGA ACT GCT AAC TCA  True
     1800      6769 ACA CCA TAT ATT TAC AGT  True
     1900      2373 ACA GCC CAA TAT CTA CAA  True
     2000      4841 ACA TCC GGC CTG CTT CTT  True
     2100      9369 ACC AAT GAT GGC GCG ATG  True
     2200     12667 ACC CAA ACA TTA ATC AGT  True
     2300      6418 ACC CCC TGC CAT AAC CCA  True
     2400     10397 ACC GAA TTG GTA TAT AGT  True
     2500      2837 ACC TCC GAG CAG TAC ATG  True
     2600     10167 ACG AGT GCG GCT TCG ACC  True
     2700       439 ACT AAC ACA TTA TTT TCC  True
     2800      9337 ACT AGG CCT ACT AAC CAA  True
     2900     14792 ACT CAT TCA TCG ACC TCC  True
     3000      9691 ACT GCT TAT TAC AAT TTT  True
     3100      7525 AGA AAA ACC ATT TCA TAA  True
     3200     15735 AGA CCT CCT CAT TCT AAC  True
     3300       511 AGC ACA CAC ACA CCG CTG  True
     3400     16317 AGC CAT TTA CCG TAC ATA  True
     3500      4410 AGC TAA ATA AGC TAT CGG  True
     3600      1197 AGG AGC CTG TTC TGT AAT  True
     3700      2643 AGG GTT CAG CTG TCT CTT  True
     3800     13257 AGT CAA CTA GGA CTC ATA  True
     3900      4778 ATA AAA CTA GGA ATA GCC  True
     4000      6095 ATA ATC TTC TTC ATA GTA  True
     4100     16138 ATA CTT GAC CAC CTG TAG  True
     4200      6774 ATA TAT TTA CAG TAG GAA  True
     4300     14937 ATC AAT CGC CCA CAT CAC  True
     4400     15790 ATC ATT GGA CAA GTA GCA  True
     4500      4688 ATC CTC TTC AAC AAT ATA  True
     4600     15019 ATC TGC CTC TTC CTA CAC  True
     4700      7313 ATG ATT TGA GAA GCC TTC  True
     4800      4975 ATT AAA CCA AAC CCA GCT  True
     4900      2405 ATT ATT ACC CTC ACT GTC  True
     5000       243 ATT GAA TGT CTG CAC AGC  True
     5100     16042 ATT TGG GTA CCA CCC AAG  True
     5200      8675 CAA ACT AAC CTC AAA ACA  True
     5300      2211 CAA CAC CCA CTA CCT AAA  True
     5400     15110 CAA CTA TAG CAA CAG CCT  True
     5500     11357 CAA TAG CTT TTA TAG TAA  True
     5600      3154 CAC AAA GCG CCT TCC CCC  True
     5700       572 CAC AGT TTA TGT AGC TTA  True
     5800     14811 CAC CCC ATC CAA CAT CTC  True
     5900     15346 CAC GAA ACG GGA TCA AAC  True
     6000      5603 CAC TCT GCA TCA ACT GAA  True
     6100     13198 CAG CAG TCT GCG CCC TTA  True
     6200     16300 CAG TAC ATA GTA CAT AAA  True
     6300     12072 CAT ACA CCT ATC CCC CAT  True
     6400      6517 CAT CAC TAT ACT ACT AAC  True
     6500     16497 CAT CTG GTT CCT ACT TCA  True
     6600      7275 CAT TCA TTT CTC TAA CAG  True
     6700     15989 CCA AAG CTA AGA TTC TAA  True
     6800     12484 CCA CAA CAA TAT TCA TGT  True
     6900     11141 CCA CCT TGG CTA TCA TCA  True
     7000     16458 CCA TAA CAC TTG GGG GTA  True
     7100     15979 CCA TTA GCA CCC AAA GCT  True
     7200     11140 CCC ACC TTG GCT ATC ATC  True
     7300     14619 CCC CAC AAA CCC CAT TAC  True
     7400     10935 CCC CCT AAC AAC CCC CCT  True
     7500      8843 CCC CTT ATG AGC GGG CAC  True
     7600     13129 CCC TAG CAG AAA ATA GCC  True
     7700      1277 CCC TGA TGA AGG CTA CAA  True
     7800      9865 CCG CCA ACT AAT ATT TCA  True
     7900      4396 CCT AAA GTA AGG TCA GCT  True
     8000     10564 CCT AGA AGG AAT AAT ACT  True
     8100     10612 CCT CAA CAC CCA CTC CCT  True
     8200       590 CCT CCT CAA AGC AAT ACA  True
     8300     13976 CCT GCC CCT ACT CCT CCT  True
     8400     16393 CCT TGA CCA CCA TCC TCC  True
     8500      7214 CGA CGT TAC TCG GAC TAC  True
     8600      1536 CGC ATT TAT ATA GAG GAG  True
     8700     10584 CGC TGT TCA TTA TAG CTA  True
     8800      8996 CGT ACG CCT AAC CGC TAA  True
     8900       727 CTA AAT CAC CAC GAT CAA  True
     9000      3345 CTA ATC GCA ATG GCA TTC  True
     9100      7825 CTA CGC ATC CTT TAC ATA  True
     9200      3873 CTA GCA GAG ACC AAC CGA  True
     9300     13180 CTA TCA CCA CTC TGT TCG  True
     9400     10613 CTC AAC ACC CAC TCC CTC  True
     9500      9332 CTC ATA CTA GGC CTA CTA  True
     9600     15142 CTC CCG TGA GGC CAA ATA  True
     9700     11857 CTC GCC TTA CCC CCC ACT  True
     9800      1241 CTC TTG CTC AGC CTA TAT  True
     9900     11739 CTG CCT AGC AAA CTC AAA  True
    10000      3297 CTT AAC AAC ATA CCC ATG  True
    10100     13441 CTT CAA CCT CCC TCA CCA  True
    10200     13959 CTT CTT ACG AGC CAA AAC  True
    10300     10923 CTT TTC CTC CGA CCC CCT  True
    10400      6698 GAA CCA TTT GGA TAC ATA  True
    10500      1857 GAA TTA ACT AGA AAT AAC  True
    10600      2982 GAC CTC GAT GTT GGA TCA  True
    10700      2144 GAG AGT AAA AAA TTT AAC  True
    10800      1603 GAG TGT AGC TTA ACA CAA  True
    10900     13915 GAT TCT ACC CTA GCA TCA  True
    11000      2536 GCA CCG CCT GCC CAG TGA  True
    11100     13758 GCA TCC CCC TTC CAA ACA  True
    11200      5251 GCC CAA ATG GGC CAT TAT  True
    11300      9914 GCC GCC GCC TGA TAC TGG  True
    11400      2875 GCG AAC TAC TAT ACT CAA  True
    11500      4769 GCT ATA GCA ATA AAA CTA  True
    11600     11681 GCT TCA CCG GCG CAG TCA  True
    11700      6578 GGA GAC CCC ATT CTA TAC  True
    11800     16455 GGC CCA TAA CAC TTG GGG  True
    11900      8871 GGC TTT CGC TCT AAG ATT  True
    12000      8164 GGT CAA TGC TCT GAA ATC  True
    12100     15178 GTA ATT ACA AAC TTA CTA  True
    12200      6971 GTA TTA GCA AAC TCA TCA  True
    12300      7774 GTC TGA ACT ATC CTG CCC  True
    12400       170 GTT CAA TAT TAC AGG CGA  True
    12500      4779 TAA AAC TAG GAA TAG CCC  True
    12600     10443 TAA ATT ATG ATA ATC ATA  True
    12700     15749 TAA CCT GAA TCG GAG GAC  True
    12800     13273 TAA TAG TTA CAA TCG GCA  True
    12900      5054 TAC AAC CCT AAC ATA ACC  True
    13000     10793 TAC CAC TGA CAT GAC TTT  True
    13100     10166 TAC GAG TGC GGC TTC GAC  True
    13200     11237 TAC TCA TCG CAC TAA TTT  True
    13300       619 TAG ACG GGC TCA CAT CAC  True
    13400     10596 TAG CTA CTC TCA TAA CCC  True
    13500      8740 TAG TAT CCT TAA TCA TTT  True
    13600      4208 TAT ATG ATA TGT CTC CAT  True
    13700      3559 TAT GAA CCC CCC TCC CCA  True
    13800     16014 TAT TCT CTG TTC TTT CAT  True
    13900     13291 TCA ACC AAC CAC ACC TAG  True
    14000     14034 TCA CAG CAC CAA ATC TCC  True
    14100     13813 TCA CTT TCC TAG GAC TTC  True
    14200      7796 TCA TCC TAG TCC TCA TCG  True
    14300     13021 TCC ACC CCT GAC TCC CCT  True
    14400      6151 TCC CCT AAT AAT CGG TGC  True
    14500      9729 TCC TAC AAG CCT CAG AGT  True
    14600      6234 TCC TGC TCG CAT CTG CTA  True
    14700      7329 TCG CTT CGA AGC GAA AAG  True
    14800      5725 TCT ACT TCT CCC GCC GCC  True
    14900     12523 TCT CGA ACT GAC ACT GAG  True
    15000     11623 TCT TCA ATC AGC CAC ATA  True
    15100      5033 TGA ATA ATA GCA GTT CTA  True
    15200      8695 TGA TAA CCA TAC ACA ACA  True
    15300     12234 TGC CCC CAT GTC TAA CAA  True
    15400      7884 TGG CCA CCA ATG GTA CTG  True
    15500      2625 TGT ATG AAT GGC TCC ACG  True
    15600     12026 TTA ACA ACA TAA AAC CCT  True
    15700      9508 TTA CCA CTC CAG CCT AGC  True
    15800     10226 TTA GTA GCT ATT ACC TTC  True
    15900      5542 TTC AAA GCC CTC AGT AAG  True
    16000      4590 TTC CAG TTC TAA CCA AAA  True
    16100     12943 TTC TAA ACG CTA ATC CAA  True
    16200     15013 TTC TTT ATC TGC CTC TTC  True
    16300      9936 TTG TAG ATG TGG TTT GAC  True
    16400      8757 TTT ATT GCC ACA ACT AAC  True
    16500      6089 TTT GTA ATA ATC TTC TTC  True

In [76]:
# Materialise the transform once and show it in 64-symbol rows.
bwtString = bwt.getBWT()
print("BWT = ")
linesOf64(bwtString)
BWT = 
GCTAGGTCCCCGAATAGACCGAATATTCCAATCGGTACAAAAAATAACCCACAGTTAGTGAATT
AGGAATCGACGATACGTTAACGCAAACTGCCTTTTCAGCCCCTGATTATTTAAATCTATATAAT
TGCTAAAAAAATGAATAAGCCGACAGTATCATTATGCACTCAGGTAACACTAAAATAAGTACCT
CCACGACCATAATTTTAAACCAGCCCACAGGTCCTATTGCGCCGAGACAATTCACACTCCAATC
TCACATTAAATCCGTGCCCCTTTTGCATGAATACCTTAGTACATTATCCACCAACAATCCAAAT
ACTACCACAACACGCATGATACCCCCCTACCAATATGTCGCACCCCTCTCCTCCGTTCCAGCAA
AGCCTTAACAACAGTACTATCATATGGTCGTACAAGACTAGTTGTCTTATGATACTATGCTATC
CCCGATTCGCCTAACCCAAAAATTCTAGATACCCGACAAAAACTGGCACCGAATAAATTTTACA
AAGGGATGAATCATTACCACTTCTCACACAAACATATGTTATAACCTCTACGGATCGACCCTCC
ATCACCTAATCCCGTCTCCCAACAATGCTCCTACAACTCGCTCGGTTTACAATCCTATCATAGT
TTCTGTTTGCGTGTGTATGCCTCGCCCGAATCCAAAACATCTTACTTACTACTAACCTTGAACA
TTCTGGCTTCACAACCCTCTGTTAAGTCGCCTGTATACATTCTAAGCCCGTTAAAAAAAACAGA
AATTATTCAAAGAGCCTCAACACAGCAATCTCAGCTGCTTCAGAGCTAACCCGCACCTCATCAT
TAAAGAATAACCTCTCACCTCTCCTTCTTATTGATAAACCTCATTCAGTGAATCCGGGCCAGAA
ATACTCACACCCTCCAACCAATATCGAGCAACTAACATTGGTAAACAAGTCGCGCCAATAAACA
AACAACTCCTAACGAGAGAAGACTGGCCGACAGTCAACCACCTATCTAAATTCAACTTTCGTTC
CCTCAGAGGCCCGAATAATTCGCCACCATCTAACGAAGATCAAACCATTATTGCCAGCAATCCC
GAGCAACCCCTAAAAAGGAGCATGGACGCCTCTTGTGATGCAGAACAAATTTTACACTGGATGT
TAAAGGCAAAACTCCAACCACAGTAAGGCGTAATGCAAATAAACCCCCACACCCGCGAACTAAA
TATATACGTAAGGACAAATTAACTCAATTTACAAAATTGGGATTCTGTTTTAGTCGTCACTTGC
GTCTTCAGTACCATACGACTCAAGCACCTTCTACTACAACAACTATTCCTACCAACCAACCTTA
ATCATCCGCTTTCAAGTCCGTTCCAACTTCATAGACATCACAATTTCTTTGTCTTGTTTCATAA
AATCACTAATCCCCACTTCTGGCTTTCTAAGTCTATATCACCCTCACTTAAGGAAAATGTCTCG
TACCACTAATTCTCAACGACGTCAGAACCTCTTCCGCAAACCGTCAAATCTGCCTTTATGTCTT
CTCCATTACAGCCAATTATCGATTTCACGCTAGCATATACACATCCCGTATATATCCAACTTGA
TCATAACGAGTGTAAAACCTACGCTACAAGCCAATACCAATCTAAGTACACACACCCCGCCCGA
AAACAGACCTGGCAATATTAAATAACGATCATCATCCTCGCATATGAACGAGAACCAATACTAC
ACACGTATTAGACAGGAACCAATGTTTAATTAACTTATACCCACTCGACAAAAAAACATATGAC
TTATCAATCCCAAACCGATCCTATAAAATCCCCCTCTACTAATTCACCCGCACACCCGAATTAC
ATACCCTACTAGTATCCAAACCCCTAAGAAACTCCCCACAACCAACACCCAACAAAACACCTAC
TCAAAACCACCAATTATTGCATCACCGTATTTTATCAGCCACAAAATCTTCCCTTAAAACTAGC
CCTCTAGCAAGCGTTGGCATAACGCCAACGCACCAGCCCTAAATCCGAACAATCACAACAAGTG
ACTTAATAAAAACGTTCCAACTTCTAAATAAACGGTGGCCGATAACTCCGCGTGCCATTTTCTG
AACAGGCCAGCATACATTCGCCACCTACATAATCTATCACAAACATAACCCATATCAATCACTC
TACATCCATATCTAACACAACAAAGGCTCATACCCCTCAAACAGTACCATGCCATTTCCACGTA
AACACTATTTAAGAAACTATACCACCCAAACGCCATGTACTCTAGAAGTAGACACACTAAAAAA
TTAATGTCGTAATCCGCACAACCGTCCATAACAACACTCCCGAGCCTCTCCTCCTGGCCAACCT
CAATTCCCCCACACCAAAACCCCCCTAGTAATAACTTAGTCATTCCCACTCACAGCAGCCAGTT
TCATAATGTTTGATTGGATCCAACAATCCAACCAAAGCCTTAACAAAACGAGCATTCCCTCGAT
ATTAACGAGCCTGAGAAGCACCTATCCCTCAAATCTCATAAAGGAAGCGCACAGTCAATTCGCA
ACTCTAAACAAACTTAGGTATCCTCATCCTCCGCTTGCGTTTCACACCTTTACAGCATTTCAGA
ATCGTTTTGTAAACCACGGAGAACCTCTGATATCCTATGCTTACGATCGCGGTGGATCCGTATT
ACACTTTCCTATAGTTTTCTAGAAAATACATTTAACCAGTGACATTTGATCGGTACTTATGATT
GTTCAAGACAAAGTGTTTTTAACCATGTCTACTGGACTACTAGCAAACTTATTTTATTGATCCT
TACGATACCTTTGATAATCTGCTCCCACCGCCGATATGTTAAATCGATCTTCCACCTCGCACTT
ACTGAACATTCTCTACTAGTCCCTAACTATCCCTGACACTTCTGTGTCTCTAGTGTTATTATGT
TGCCCTGTTAAATAACTTCTCTCACCTGTATTCGTAACAACATCTCCCGATTCCAACTGTCATA
TCACCAAAACGAAAATATTAAACACTCCCCGTCCTTGGTCACCTTGATTTAAGTTGCCTTTTCC
ATATCCCACACACCGCCACATACAACTGTGTTGACTCCTCGAGTTGATATTTAACAAGGTACAA
ACCCAAGATTTGTGTGCTTGTACCGGTTATTACGCTATACTATGTCTACAGGAAACCCTTCATC
CACCACACGTGTAATTTTACAACTGAGTACAAATTTATACGCCGTACATATCAATAACTATAGC
CATACAAACTACGCTGCTTTTACTTATTTATACAGCCGGATCCCTATAGAATACTGTGATCACA
GTTCTTTTCGGCCCTAATACTGGATTTATTTTTCTTTTGTGGATATAGTCGTCCTCACTCCCCA
CTTTTTCCACTCCTTGTTACTCATATAATAGTAGTTGTCCCGCGACCTTATTTGTACCGATACC
TACCTCTCAAATGAAAAGGTGCATATGAAACCAAAACGCAACGTCAATAACACAGCTATCTAGA
TCAAGAACTGCTCTATAGGGTAGGCTCGTTTGAACAGTTTTACTAGCAAACTTTTACTAGATCA
AAAGTCCTAATTGGATGTTTGTTGTTCATCTTATCCCGCACTCAGCAGCCGACTAATCGTGTTA
TGTTGATCGTCTTTAGGTACGTTTTTTGACGCCTGACTTGTCGAGTCTCAGCGCTTCTTATACG
TACCTCTGAGTACATGGACCCTGTACTGTATCTTACACGATCGCGATTCCATTCTCCGTTCGAC
TCTGTAGTTTAATTGTCCTAAACCAATCACACTCGCAGGAACCGTTCATCTCTGCCACATCCCA
CGGCCCAGGCGTAATAGAGACATTATGTTATTGGTCCCTCCCAGTTTAAACCCTAACCTCACCG
AGACAACCCAAGATCATTACCAACCTCACCTTAACCCGGTTCTGACCACCCTCTAACACACCAT
ATAACTTAACACCCCCACACGCCAGCCCCAAAACCCACCAAACACCATTACCTCACCAATGCCA
TGTACTCTACCGCCCCTTTTCGTAAGAAGAATATTCAATTCTCATCCCACATATGCCCTCCGAC
TTCCACAAGTTCACGCAATGATTTCTTCAGCAAAACAGACCAATAAGAAAAAACACTTCTAAGA
TCATTCCTAAACCTAAATTTATCCTTCCTATACGTAGACTCCAGACGATTTACCTTGCAAACTT
GAACCATATCGGAAAATTATACAGTACAATCCCAGACCCCGGACTAGCAACCTAATACTAAGAC
CACCCCAATTATCATCCTCCTTAACGAAAAATCCCGCCTTCCCCAACAATTCGCCCTAACACAA
ACGCGTACCACCATCTCCCCATTAATCACCCACCCATTTCACCCCCGTCGAAATCCATCAAGCA
ACCATCTTTAAACCTGTATATTACCAAACACTCTTGCTCCCATCTCCTCCCGGCCTAATTCATC
CACAACCAACCGCGTCTCAATTTCATCTCAATCCAAATCAAACCCCCTACCCATAAACTGCCTC
ATCATACTCTACTCCTTACAGATTATCCCACCATAATAAATTACATTTACGTAAGCTCGTTCCC
TAATAATAAAAACCCACAATCCACTGTTGCGTCTGTAATAGTGCATATCAAAAAGCCATCATTG
CCTCCTCCGTGCTATGTAATACACACTACACGCTCTCTCAGCCGCACGAAATTCCACCCACATT
AATCTCAGACCTTCAGTATCAAAACACATCTGTGTTTTTTCTCTTTACCTGAACGGACAAGCGT
GTCATGTCCCATAGTCGCCAAAGACTCTATAAAAAACTGTCATAAATCTTTTCCCTTCGTTCCC
TGTGTCCTTACAAAAATTATCAGAAATTCTCTAACACACACTCCCTTCCCCAAACCCATTCTGT
CCTTAATCCAACGCCTTTCAACGTACTTTTCCCAGTGCCATTACCCTATCCAACCTTTCTTCTT
AATGTAAGAATTATTTCTTCACCACACATACACTCTCCGTCAACAACATGCTTCCTTCGTTAGC
TGTGTATTGTGATCGTCAACTGCTTGTATATGTCCATAGTATACGCCATCATACATCTCCACTC
GGTGCACTACTAACCAACCTCTACTAGTCATGAATAAGCGCACCCCCTACAAAGCGCCAAGCCA
CCCCACCAGTAACCCATATTTAGTGTCTCAAACCACTCCTAAACTCATTACCCTTGGATTCCCA
ATCTCCTCAGGTCACGACCTTTATCCCAATTTACCTTAAACCATAACCCACTTTATCCAGATAG
CTAAATCAATCATAATCTTTCACAAGTAATTATACAAATCTTCCATTAACAGTAATAAGTTTCC
TAAGCTTCTCTAACCCCAACTTATGTCCACCGGTTGTTCTTACTTTTGAACCACCCGAAACACA
GATTTTCAGTGTCCAGAATACGTGACCGTGGAGTCTTCAAAGGCCGTGCCTCTGTGTCAAATCA
TTAAGTCTGTATCAAGACTCTCTTACAAACAGGTATATCGATATAAATCACCGCTCGCCACAAC
CTTCCAAAGGTAAATCTTATTATAACACAACATTCCTTTCTACACACCCCATTGCAAATGCATA
AGACTACTCAGTCTCAGAACAGCTTATTCAGTCCGCTCCTCCCAATCCATTGTTTCTGCCACCT
TCTGCCTAAGGCACACACTCCAACCAGCTAGTACGCAAGCTTGAATTGCGACCCTACTCTAAGT
TTTTATTACTACCCTCCTTGCTCGCCATCTAATAATTCTCCACATTACTGTCAGCCAATATCGC
CACTTAATGTCAAATCTCTTTCCACCCAATAGCATATCTGACGCAACAATCACCTTACTTCACA
GTTCCCTCCCTTGGACCAAAACCTAATGTTCGCCGCCTCCATACCTTAACCCCGCCACAGATAA
CATGCTTCCACACAAACAAGATCCTACAGCATAACACCCCCTGTTGTTCCAACACTGTGGGCTT
GAAACACTCTCTTATCGATCTTCCTATGCGATCGACAATTCCAATATAACGCACTTGGACTTCT
AGTTCGTAAGCAACAGAGAAGAACATGCCTACCTGATATCATAGATAGGCATGCTAGAAGATAA
CAGACATATTGGGTAATTGGGCGGGAAGGACTTACGAATTCAAACAGAAGGCCTGAAGAGGTGA
GCCAAAGAAGCAACTGCACCCCGAGACGTCGTAAAACACGACCGCATCCATTGCACACTAACTG
ATATTGCCGGCCTTCTTGAGCGAATTCGTTGCACACTGCCCCCTACCCGCAGTAAAATCTATTA
TATATATACCCATCTCCCATTCATAATCTAGTAAAGCTCGGAGACTTCTAGTGTAGTCCATAAT
GAAACGTAGCCCCCCCCTGTCACTAACCCATCTCGGCTGGCACTTCTTTTAGACTCTTTATCAC
CTCGTATTTTCTTCCCATTAACCAAGTCTCCAAACGAATTCTCTTCAACACTAACTCGTCTGGT
ACTCACCTTACAAGCCTCCCATAACCCTTAGCTAACGATACATCTATAAGCGATGCCTACTTTA
ATCTATTTTTGCGGCGACTCATCTTTCACTCTTGATCACTGCGATTGCCGCCACTTCGGCATAA
GTAATCACTCAGGACTTTCCCTCAGCAGAATAAATCTCTATTAGCCTCAGTATAAACAAAGACC
AGCTAACTTAAAGCGCACGCGACCCCGCAATTCCCAAAATATTTCACCCTCCAAACGAAGCAAA
AATTGAATACCCCCACCACAGTGAACATACGAACTCCGACGTCGCGAGCGCAGCTCCAAGAACT
CGGGCACGCAGTAACAACACCTGTTACCGCTACCCTTGCAGGTCTTGAAATACTAATACATTCC
CACCCGCACCCACCTCTAACACATCCCCAACCGTTCCGAGCGGACAAACAACTTGGAGCATCCG
AACTAAAACCAATCTTTCGTCTACCAGTGTCATAACCTCCAACGCGCGCACATATCCACGGAAA
CCTTAGTTGGAGACTGGCCCCTCACTACGCATCATGAAACTACAGTCCCCCGCCCGCACCAGCC
GCCCCCGCAATTAGTAAAGCACCAGTACACACAGATACAGACCACCAAAGAAACACGACGAACA
CTCAGACGAAACACGCAAGCACCACCTTACCCCACGACGTGACACCCTATATCCGGGCATAAAA
CTCGGCCGGTCGCTCCAACTAACCTACGCCCGCGGGCCCACGCCACCCTCCACCAGCGACCTGG
CTACATCTCTTTGTCATTTTCAATACACAAAACTATCTAAATCCCCTTACAACCACACTCGCCG
CCACAAACGGAGATGTAAAAGATAAACAATGAGTATGTTTAAAATCACACCATATAAAAATAGA
CAAGGCCAACCTCCCAAACCGCTTGCCACTAGAGTGTCCATTTCAACTGTCACCCCCCCCAAGT
CCAACTCAATAATACCACGTCTCCCTGCACCCGCCACTCTTCCCAGCCATCCGCCCTCGCTAGG
AAGTCCCGTCATCACAAAATGCCATTGTACCTCCAAGTCGACACAGGACAAAAGCTGAAACGAA
ACCGTAAAGCGCATACATATCTACCGGAACACGCCCCCTACGAAAAAGGGGAAAATTACCTCTC
GCTCTTCCGCACTCACAATGACACGCTTCTCCAAGGTCAAGTACGCCCTGCTACCCAGTTACCT
ACGCACCAATGGCCACTCTGCTCAAACCTAGTCCTCCCCCAAACACGGGGCTACTATCCTACAC
ACCGCGGAGCAGACCAAACATTTAGAATCTGTTCAGGGACTTAGAAGTATACCAAGCTTACAAT
CCCTTCATCCATACAGACACAGATACTCTCGCACCCTCCCAGAGTCGCATAACCCCTTTCTTTG
TTTGCCACCTGTATTTCTTCATCTGGATTAGACCAACTAGCGCTCCCAGACCTCTATGTCTGTT
GCTCGTCGGCTCCTACCGACGCTAGCAGTGTGTTTTCTCCTTATCCTCTTATCCCCGGTCGTAC
GTATGCCATTCGCCCCATAAGTTTCACTGATTACTACAACAGAACTTGATCTCGTCACCCGATC
TGTCAGCCGCCTATGTGTCTCTTCATCTAAGCCACTCACCACTTTAAAATACCTATAAGTCCCA
TTCATAAGACAGACTCGCAATGTCATAACCCCCCCCCCCGCACCAATCACATCATCTCTACCTG
CTATAAAAGCCCCACCCCACAGCCCTGAAAGTGCACTAAACTGGATATTGGAGTGTCAGCCAAC
GCCAATAGCGCGCCTACTTGTTCGCTTTCCGCCCAGCCGAACCTCCTCCGCATGCCAGCCACCG
CCGCTAGGCTAGAGATCTTACTACTATTTAAACAAGCAGCTCGCAAATAGAACCTCTTAAAATG
CATTCACAGCTTTGTTTCTTCTCCACTAGAACCAACAGCTAATACTAGGCTCCCCTGAACAGTC
CCGAAATCACGAAATTACTTGGAGCACACACCCGTAACACAACAACACGGGGTCCCTTTTAATG
CATTTACCCTGTCCTTTCCATGCCAACATCGATACCCTTTACCAACCTATAAGTTACCCCCAAT
GAGAAACCGCACTTTTGACAGCCGCCCCCTCTGACCTCCAAGTCCTATTATCTTCATTTTTATC
TTCCAACTCCCGATATACCTCTTCTCATTCTATACTCTTCTACTCAGATTTGCCAGAAAGGCTT
AAACCCCAGGAGTTGGTTTAAGGCATACGACTCCTACACTGCTTCCCCCGTCGCATTCGCTGAC
CTTACCCACCTTTCTAAAGATCAAACAATCGTAACACCAACGCGGCCCATAAACGCTACCACGA
CGACTGTGCCCTACGTGACATCATTCCCAGAGTCACACGAGAACCCCGCATAGTCAAACGGTCC
GGGATGGAGTAGCGCCACAAACACTTACAAAACCCAAAGCTACAAAACCAACCCTCCTAACTGC
AATATCTATACCCGTAGCCTCGTTGAAACACAACGTTCCCTCTACCCTTGCATAATAGTCCCGT
ATGAACGACAGACCCACATCAACCACTCACAAAGAACCTACAATATCTAAAATGCTCCTCTCAT
AACGCCCTGGTCCCCGCTATCACACTCCCCTACCACCCATTTCCTCCCACCCTCCCACCTCCCC
CCACCCCCTCCCAGACTACTACCAACCCACATCATCCCTGTCAAACCAAAACAATACTACGTAA
CACAGCCGAGCCCACAACACGTTACCAGCAGGCACACAGAAGTCACCAACATCAACCGAAAACG
TGAGCAGACAGGCTTCCCCAACCCCACAACACCTTCCAACCTATACAGCAACAACGCACAGCTC
CCCACCATGTATCCCCTATTTCAAAGCCGCCACAGTCACCTAGTCACACCTAACAGCCGCGACT
CTTTACATCCCATTGGCTTCATGACTAACCTTTACAGCATAACCACATAAAAACATCACAGCAG
CACAAACCTTAATATTATAAGAAATATAATCATTCCCTGCTAACTATCCCCCCCACCCATTCAT
CATTTCTCCTACACCCCCTGCCTCTCATCGGCCTACACCAAACTTCCACCCTCCCCCAAATGCA
TACTTCTCATAAACCTTTACTAAGCTAACTTGAGAGCCCCCCACCCAATCCTCCCCATGGTACC
CACCACCTACCCCCTCCTTTGACCAACGTAGCTATAATCCCACACGCGGCCATTATAATCGTCC
AATCTCGCCCCCATGAACCTTAAACTACACCCGCCTCAATACCGCAACCGTCCCCGCATCTTTC
TTCTTCTCCTATCCCCATATAAATCCAGTTCTCACGCACCCCCTTCCTCCCATCGATCATGATT
GCCAGCGCCGAATCACTCCAAAGTGAGCCTGGATTTCCTCGACCTGAGCGCGGCTATTAAGGTT
CCTCGTGAAACAGACCTCGCCAAACCCTCCTACGCGCCAAGTCCCCTGTGGTCACCCCACATTG
TGCTCACTGGCATGTTTAATTGTTACGAATACTCCTACTGTCCCGATCATTGATATGAATTACG
CAGTACTCTGGACGGCAGCGAATTTGACCACACTTGATGGAATCTGGTTGCACTCACCGATCAA
CATAACATTCAAACTCCAGACGTACTTCCTTTGCGCACAACGTTCTCATTACCCATTCCCGCAA
GCTAGATCATTATCTCATACCCTAAACAGCGCAATTTATATTTGAAAAACCCCTCGGTCCCTAC
GGTCGGTGTAACATCTAATATAACCATCCTATGTTCCCCCAATCTTGAAACGGCCAAAGTATAT
TTAATAAGTGGTAAAGCAGTAGTTCAACTCATAACCTCAAAATTCCTTCTAAACGCATTGTATC
ATACAACAATGGGCCTACAGACGTATTAGATACCTAATAAGTGCCGCGGGTTCGTAAGTTTAGC
CGTTTAGATACCATTCCGCTTGAAGTGATGTATATACGTGATGTAACGTACTTAGAATCAAGCA
CACTTAAATCTATTACCTTCCACATATACCAAAAAACTAACCTCGGCCGGAAATTAAAAACAAA
GACATTGTCGTGTGACAGATTTTTGGTTGTGCGTTTTGCTTAGTGGAGTGTTTACCCATAGCGT
GTTGATGCCAGTAGGGCTAGGCGCCCTCATTCAAATTTCGGCGATTTCTTGTCGGTGAGAGGAG
AGTTATTGACGAAGGAGGGATTCCTTAAAAAGCAATCGATGGCTGCAAAAATTTTCCGGTTGGG
AATCGTCATCCTAATTTCTCTATCGTC$ATGTTGTTCCTTGTTTTTCAATTGGTCTTACAGATC
CATCAAGCCTAATGGTGCTAGCCAAGATTAATTTGTATGTTCACTCAAAACTCGGAAGACGTTT
GGACTGAATCACCAAAATCAAAACAACTTCCAAAGTGTGGAGTGATACAAAAAGAAGTCCTTAT
CCTACTACCCAGATTAACAGAACAACAATTGGTATAAACCATCATCAAGAAACCACCAAATAGC
CGCAAATGGTAAATAAGTAAAGGCCACCCAACGCCCAGTAAAGAGCAAGAACTTGACATAGATA
AGACAACATACAGAAGACGAGAAACAGTATCAACATTGACTGTAATGTCTACGACAAAGAAAAA
TGTAAATCGCCAAAATAGCGCTAAAAGAGTGGTAATATTATACTAAAGTTCGACTCAGCTAAAA
AAATAAATCACCATACAGGCACCAATATTACAACTCACCTGAAAGCACGAGAAGCCACAGTAGA
GGTTGTAAGAGACAGTGGATCCCATATATGAGGAAATCGCCTCGCCAGCAGACCCCACAACGAA
AGCAACAGATGGGGTTTGGCGCAAGAGGGATAGGAGAAGGGTGAGCGCGAGGCGGTCGAGTAAA
AACGCCACCTCCTAAATAAACACGGCGGCAACCCAGGCATCTTAATGCGACAAAGCGCAAAAGG
ACGAGGGTCTTGCGAGCTCGATTCAAAATTCACAGCAGTACAACATACAAACACACACAGAAAT
AGTAGGGTCTTATAGGATTTGCCGCAGTGCGGACTTTTGAGGGAAAGCAGGGATAAACTAAGAG
CGCTAACAAAAAAACAAAATCTGAAATAGTACACCCAAAAACCAGACAGGAAAACATCCCTTAG
CAATCAACTAAAACACAGTTGAGTAAGTCAATTTAGACTAAACAATTTAAAGATAAGTTTCAGA
ACATCACTTTATTGTAAGTGTGTTGGAAATAAGGGCATAAAAAATCGACTTAATACGAAAAAGA
GACTCCCTTTACAAATTCCGCATAAACGGAATCACTGAGCATATTCAGACAATAATTCATATAC
CCAATGTATTCCGACCTCACGTGTCCAGATTGCATCCGCTAAAATTTTCGGGGATGGGTACGAT
AAAAAAACTATCGCCTAAGTGTGCAATTAAAGCTCCAGTGGATGAAAAGTGTGTTTCCAAAACA
AAAATAGTAACGTGGTCACATAGGTAGCAATTACCATATCCCTTATACAGATAACAAAGAAATA
AATGACCAAAACCAACGGTATGAAATCTCCCCTAAGATAAACACTGATTTGATTATTCGATTAC
TTAATAACCTATTAAACGCGAAGAAATACATGGTCGTCGATAACATATTGACTATGGGAGAGGG
GCATAACGTACGGTCACATATAAGGCAAATCCGCAGACAACGAATATACTTGTATTGACTAGAG
CAAAGTTATGTACGTCTAACGGCAATTACAAAACTAAATGCAGTCAATCGAGCCGTTAATTAAG
TAGGGGAAAGTAGAGCATAGAGAAACCTTACGAACACTTGCAAATTACCAGAAGTTTCCTACGT
TTTTGAACTTATAGCACAAATGGTAACCCGAATATAGCTAGAGGACGGAGGGGCTCGAACTTGC
AGTAGAGCCTTCCTGACCACATTCCAATCACGATAGACAAAATAATCTTAACCTTCTCCTAAGC
ACCAATCCGACTATATGGGTACAAGCTTCATCCTCTCGTCACATAGATCCCCGGATTGAGTCAC
ACTTCCCCCACTATAACCTCGCTCACACGTCATGACCTTCTCGCGCCACTTCCAAAGTTCTACC
AATAACCCCCCTCACCCCATACACCCTTCCCTTTTGATTTCCCGTTTCTATGCTATCAATCCAC
CCTCTCGCCCCCGAGGTGCACCAACATGACAGTGCAGAAGCATTCCGACTCCGACCACAACCAC
ACATAGACACACCCCCATAAACTAATCTCGATCAAAGCGAAACCCACCACAATACTCTTCATTC
TTGACAATACATCTCACCCTCTTATACACCTACCCAGCACACCTCTCTTCTTAAACTCCGCTAT
AACCCGTAACCCTTCATCAGTTTTTTTGAGTTGCCGAGCTCATTGTGACTTCAATAAAACATTC
TCGTCTCTCCCGCTTCTTCCTATATCGGCAAAGACTACTACAACTTCCCGCATTTCTACTTCGC
ATTCTCGCTCGCCCCATTCCTCTACACTCCATTAGCCCCTTTCCTCACCCCCACAGCATTCCAC
GCCCCCCCTCATTCATCAATACCCAAGCTGATTCAAGATCACCCTTAACAAACAACCGCTGTAC
CCTACCACTACCCCATCCCCCCCCCCCGACCCAACCTCATTACCTACGAATGCGTTGACTACAT
ACACACATGCCCCCTAGTTTAACACCCACACTGAAACACACACGGCCCCGCCTAACCTGCACTG
ACGTATGTGCTCACAATCCATAACCCTGATCTAGTCCTGTTCCCAGCTCTCGACCACCCGAAGC
GCCACTCCAAAAACCATCACGCGAAAAGTTTCATGCGACACCCGCCGGCCCATAAACCATCAAC
CTGGCCGCCCCATCACACGGTAACATAACCAATGTGTAAAGCTCAGAAAGAATACGTTGCCTCT
CCACCCAATATACTCCTAAACACGAGTTTTCAAGTGGCAAGACATTTACCCTCGCTTCCAGCTC
ACACCCTGTTTATACCAAACACTGCAGCTGATTCCTTTACACACTGATCCCCCTCATACCCCAC
TAAATTCCCTTCAACCAGCCTTTCATCCGATTTACGCATTAACACCCATCCTCCATCTGACATC
AATACAGTAATCTCCTCAACGGGCCCCAGAATCGTCTCTCCCAGATACCCCCATTTATTCTCAA
CGCGGGTCTGATATTCCTTAAACTCTCCATCCTCCTCCTCCCCACCAGTCCTTCTAAAAAAGTG
CCTCCCGAACAGAGTTCTGCATGACATTCCACAACACCTAATACTTGTCGGATAGCTTAAAAAT
ACCCACCCTACAATTACGCCCAACCGATGGTACTCGCTAAACTCTCCTACTTATCGCATAAAGC
AGAATTCCATCCATCCTGCCTCAAAGTTCTAACCACTAACACTGCACTTATGGCAACCACCATC
GCAACACACCATCCCTATCCCCACGAAACCGAAGATAGATTTAATAAACAAAACTACTCCAATC
CTCCACACTCCCCTTCTATAGACCTCTCAGCCGATAACCACCACGTTCAACCCTACCAAAGTAC
TCACCACAAATCCCAAAAATATTAATCCAAACCTTCCAACCCACCAAAGTTACCCAACTATCCC
CATTGAAACTTACCTTCTCATTCCAACCTGATGCCTCTCACAACGACTACCCTAAATGAATAAA
ACGCCCCCCACTGAATACGTATCTATCTCAATCCCCCACCTTCTCCCCCAATTCCCCCACCGGA
CGTACCCCCAAAAGTGCCTCGGCACCCCTCTCCGACTAAAACACACGCTATTTACTTTATATAT
TTATATCTCATAACCACACTTCCGCTCCACCAACCCATCAACCACACACCCCCAGGGACTCTCC
CAACATACACCACCACAAACCCGATAAACGGCACTTCTTACTAACCAGTCCCTGATGCACTACC
GGTTGACTATAAGACCCATTACACCCCCCTTTCCTAGACCAATAGATTCACCCAACCCCATAAT
TCCTCACTCCGTCACCCCATCCATACGCTGCACTTAATAACTAGTTGCATTTATTCTATATCAA
CCACTTTGACGGCCCAATTAATTGTCCACAAAACTATAACAATCTTCCCCAACCAACAATATAC
AATATAGGTAACTAACATTTACTCATGTCCGACCTTTTACCGTTTTCAGACTCCCACTCTGTCT
CCAATTTCCATACATCAAACCTTCTCTTATTTATCAGTCGATCATCTATCAGCATTCTCTCAAG
CTGCTAAATCTATAGCATATTGGCCACATCAACCGTACTCCTTTTCGTCATCAATTTTTTCGCC
AACTACTGAGTCCAGACTCTCGATTAGCGCTGTCCATAGAAAATGGATTAATACACACCTATTA
CTTGCACGACTAACAATTCTTTCACACATTTTCACACTTTTCCTTACATATATGACTGCGCTCG
TCTCAAGAGCCTAAAAATAAACCACTAATGCAGGACTTTGAACCACCCCCCGGTCTCAACAATC
CGAATAAACCTCTCTGCAACCATTTAACAACATATACTAAGTCACGCTTTCACTCCCCAAACTC
AACGGCCACCCACCCAACAAGCATACCAGCCCAAACTGGAAAAACACCTTCCCTCGAACTTTAC
GCGTAAGCAACCCAGTCGATTTAACCAATCAATGATGAATCCCATCCAACATGCCGCTCTCTCT
CAATAAGCACACCATTTATTTCAGCAGCACCCCCTCCGCACCAGAGCTATGCCAGCCAAAAGCA
GCCCCCACTTTTTTGCCAGCCTTAAGGTGAAACTCACCACTTACACCACAGCATATCATTAGAC
TTGAGTACCCAGAAATACTTCCCAAATTCTATATCTATCCCGTATCTTACTTCCTCTTGTTATC
TTGGATTCCTCCCCCTTACTCTCACTCAACAATCAAAGCAACCTGCAATCACAGTGACATACCC
TCACTCCCAAACAGAGCAAACCCCTACCACACATCAGTAAACGCGGATACCCTAAGCTTTTAAT
AGACTAAAATTTAATTTGCAAGACTTAACAACCACAACATAGCCACTCCACAGAACCATCCTCC
ATATTTTTATCGATTTTCCCTACTTTGAACCTGTATCTGATATACCTATCTCCTAACACACACC
CCACATCAAGACAATATCGCACTAAATTATAACTCACCCCCCCATTCACCACGATCCTGCGCCC
CAAAAAAAGTACCAAATTTATCATTACCACCCCCATCAAATGTTATCTCCCCAAATGTCTCAAT
AATGGCATACATATATACACTAAATCCCCCCNTCCAGCGTCTGAATGCAGCCGGACCCATTACC
CTGTCTTCCTCATTTTAGCGACCTGCCCCTTACTAAGCATTTTAGCCCAATCATCTGTCCCCCG
AACCGCCCAATCAATTAGCCTTGACCCTAATACCCCAGAACTCCTCTTTAAATCATAAGTCCAC
AACCCCCCTACCGACTCGACTCGCCACTTTACTTGGCCGCACAAAATAGCAACTAACACTACCA
TTAATCACAAAATTCTCCAACCAAGCATCTATCACCTACACACTGTCACCCCAGCCGACTCCCC
TCAATACAAGTAATGGTATTATGGCCCCACTGTAATGTAATTAGTCTGTCCATAAATTTACACC
CTACCAAATACTGTAACCTCAATAGCTATGTATGGCCCATAACCTGTGCGCAGTTGAAATTCAG
ATCATCGTTCGAGACATTCCAAACACACCGAAGTTAATATAATTCGCGTCGTCCAAAATTATAT
AGAAGACTGAAACTACTTTAAAGTACCTAACACAATAGCTGCCTCAAAGATATCACACTTTAAC
TAATATGAGCCAACCAATAACTCCCGGATTGACATCCACATACCACTCAAGTATTCTGGCCTCC
ACCCAATAGCCCAACTATGCCATACCGATAATTGATTCAATATTACCAATCCTAATAG

In [77]:
# Look up the pattern 'CATCAT' through the `bwt` object. find() returns a pair,
# unpacked here into lo and hi; the following cell iterates xrange(lo, hi), i.e.
# it treats the pair as a half-open index range [lo, hi).
# NOTE(review): presumably these are ranks in the sorted suffix order -- confirm
# against the definition of find().
lo, hi = bwt.find('CATCAT')
print lo, hi
6407 6429

In [81]:
# Print the first 60 characters of bwt.suffix(i) for every index i in [lo, hi).
# lo and hi are notebook-level variables assigned by the bwt.find(...) cell, so
# that cell must have been run before this one.
for i in xrange(lo,hi):
    print bwt.suffix(i)[:60]
CATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAA
CATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACT
CATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAATAATCGGTGCCCCCGATAT
CATCATACTCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAACAC
CATCATAGAATTCTCACTGTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAA
CATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAAAATCTTAGCATA
CATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCAC
CATCATAGGAGGCTTCATTCACTGATTTCCCCTATTCTCAGGCTACACCCTAGACCAAAC
CATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCC
CATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAAC
CATCATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCTACGCCT
CATCATCCCTCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCCAAC
CATCATCCTAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACCGA
CATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGT
CATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCAT
CATCATGACCCTTGGCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCC
CATCATTAATAATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCT
CATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATC
CATCATTATTCTCGCACGGACTACAACCACGACCAATGATATGAAAAACCATCGTTGTAT
CATCATTCCCCCTAAATAAATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATT
CATCATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCTTATCAC
CATCATTGGACAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAAC

In []: