Note: Our Python/Jupyter study session will be held in SN014 on 1/28 from 5:00pm to 6:30pm
1
1 tagtggtcttttgagtgTAGATCTGAAgggaaagtatttccaccagttcggggtcacccagcagggcagggtgacttaat
2 cgcgactcggcgctcacagttatcgcacgtttagaccaaaacggagtTGGATCCGAAactggagtttaatcggagtcctt
3 gttacttgtgagcctggtTAGACCCGAAatataattgttggctgcatagcggagctgacatacgagtaggggaaatgcgt
4 aacatcaggctttgattaaacaatttaagcacgTAAATCCGAAttgacctgatgacaatacggaacatgccggctccggg
5 accaccggataggctgcttatTAGGTCCAAAaggtagtatcgtaataatggctcagccatgtcaatgtgcggcattccac
6 TAGATTCGAAtcgatcgtgtttctccctctgtgggttaacgaggggtccgaccttgctcgcatgtgccgaacttgtaccc
7 gaaatggttcggtgcgatatcaggccgttctcttaacttggcggtgCAGATCCGAAcgtctctggaggggtcgtgcgcta
8 atgtatactagacattctaacgctcgcttattggcggagaccatttgctccactacaagaggctactgtgTAGATCCGTA
9 ttcttacacccttcttTAGATCCAAAcctgttggcgccatcttcttttcgagtccttgtacctccatttgctctgatgac
10 ctacctatgtaaaacaacatctactaacgtagtccggtctttcctgatctgccctaacctacaggTCGATCCGAAattcg
2
def Score(s, DNA, k):
"""
compute the consensus SCORE of a given k-mer
alignment given offsets into each DNA string.
s = list of starting indices, 1-based, 0 means ignore
DNA = list of nucleotide strings
k = Target Motif length
"""
score = 0
for i in xrange(k):
# loop over string positions
cnt = dict(zip("acgt",(0,0,0,0)))
for j, sval in enumerate(s):
# loop over DNA strands
base = DNA[j][sval+i]
cnt[base] += 1
score += max(cnt.itervalues())
return score
3
seqApprox = [
'tagtggtcttttgagtgtagatctgaagggaaagtatttccaccagttcggggtcacccagcagggcagggtgacttaat',
'cgcgactcggcgctcacagttatcgcacgtttagaccaaaacggagttggatccgaaactggagtttaatcggagtcctt',
'gttacttgtgagcctggttagacccgaaatataattgttggctgcatagcggagctgacatacgagtaggggaaatgcgt',
'aacatcaggctttgattaaacaatttaagcacgtaaatccgaattgacctgatgacaatacggaacatgccggctccggg',
'accaccggataggctgcttattaggtccaaaaggtagtatcgtaataatggctcagccatgtcaatgtgcggcattccac',
'tagattcgaatcgatcgtgtttctccctctgtgggttaacgaggggtccgaccttgctcgcatgtgccgaacttgtaccc',
'gaaatggttcggtgcgatatcaggccgttctcttaacttggcggtgcagatccgaacgtctctggaggggtcgtgcgcta',
'atgtatactagacattctaacgctcgcttattggcggagaccatttgctccactacaagaggctactgtgtagatccgta',
'ttcttacacccttctttagatccaaacctgttggcgccatcttcttttcgagtccttgtacctccatttgctctgatgac',
'ctacctatgtaaaacaacatctactaacgtagtccggtctttcctgatctgccctaacctacaggtcgatccgaaattcg']
print Score([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], seqApprox, 10)
%timeit Score([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], seqApprox, 10)
So even at a blazing $50.6 \mu s$ we'll need many lifetimes to compute the $70^{10}$ scores
4
Consider any permutation of offsets that begins with the indices [25, 63, 10, 43, ....]. Just based on the first 4 indices the largest possible score is 17 + 6*10 = 87, which assumes that all 6 remaining strings match perfectly at all 10 positions.
DNA[0][25:35] a a g g g a a a g t DNA[1][63:73] g t t t a a t c g g DNA[2][10:20] a g c c t g g t t a DNA[3][43:53] t t g a c c t g a t a [2, 1, 0, 1, 1, 2, 1, 1, 1, 1] Profile c [0, 0, 1, 1, 1, 1, 0, 1, 0, 0] g [1, 1, 2, 1, 1, 1, 1, 1, 2, 1] t [1, 2, 1, 1, 1, 0, 2, 1, 1, 2] [2, 2, 2, 1, 1, 2, 2, 1, 2, 2] Score = 17
If the best answer so far is 89, there is no need to consider the 706 offset permuations that start with these 4 indices.
5
6
5
bestAlignment = []
prunedPaths = 0
def exploreMotifs(DNA,k,path,bestScore):
""" Recursively finds a k-length motif in the
list of DNA sequences by exploring all paths
in a search tree. Each recursive call extends
path by one index, once the path reaches the
number of DNA strings a score is computed. """
global bestAlignment, prunedPaths
depth = len(path)
t = len(DNA)
if (depth == t):
s = Score(path,DNA,k)
if (s > bestScore):
bestAlignment = [p for p in path]
return s
else:
return bestScore
else:
# Before going down this path further let's
# consider if an optimistic best score can
# possibly beat the best score so far
if (depth > 1):
OptimisticScore = k*(t-depth) + Score(path,DNA,k)
else:
OptimisticScore = k*t
if (OptimisticScore < bestScore):
prunedPaths = prunedPaths + 1
return bestScore
else:
for s in xrange(len(DNA[depth])-k+1):
newPath = tuple([i for i in path] + [s])
bestScore = exploreMotifs(DNA,k,newPath,bestScore)
return bestScore
def BranchAndBoundMotifSearch(DNA, k):
""" Finds a k-length motif within a list of DNA sequences"""
global bestAlignment, prunedPaths
bestAlignment = []
prunedPaths = 0
bestScore = 0
for s in xrange(len(DNA[0])-k+1):
bestScore = exploreMotifs(DNA,k,[s],bestScore)
print bestAlignment, bestScore, prunedPaths
%time BranchAndBoundMotifSearch(seqApprox[0:6], 10)
7
8
9
def ScanAndScoreMotif(DNA, motif):
totalDist = 0
bestAlignment = []
k = len(motif)
for seq in DNA:
minHammingDist = k+1
for s in xrange(len(seq)-k+1):
HammingDist = sum([1 for i in xrange(k) if motif[i] != seq[s+i]])
if (HammingDist < minHammingDist):
bestS = s
minHammingDist = HammingDist
bestAlignment.append(bestS)
totalDist += minHammingDist
return bestAlignment, totalDist
print ScanAndScoreMotif(seqApprox, "tagatccgaa")
%timeit ScanAndScoreMotif(seqApprox, "tagatccgaa")
Wow, we can test over 500 motifs per second!
10
11
import itertools
def MedianStringMotifSearch(DNA,k):
""" Consider all possible 4**k motifs"""
bestAlignment = []
minHammingDist = k*len(DNA)
kmer = ''
for pattern in itertools.product('acgt', repeat=k):
motif = ''.join(pattern)
align, dist = ScanAndScoreMotif(DNA, motif)
if (dist < minHammingDist):
bestAlignment = [s for s in align]
minHammingDist = dist
kmer = motif
return bestAlignment, minHammingDist, kmer
%time MedianStringMotifSearch(seqApprox,10)
Should we declare victory and move on? Do you find anything uncomfortable about an algorithm that requires $O(t N k 4^k)$ steps?
12
HammingDist = sum([1 for i in xrange(k) if motif[i] != seq[s+i]])we had counted matches
Matches = sum([1 for i in xrange(k) if motif[i] == seq[s+i]])and found the maximum(TotalMatches) instead of the min(TotalHammingDistance) we would be using the same measure as Score().
13
14
def ContainedMotifSearch(DNA,k):
""" Consider only motifs from the given DNA sequences"""
motifSet = set()
for seq in DNA:
for i in xrange(len(seq)-k+1):
motifSet.add(seq[i:i+k])
print "%d Motifs in our set" % len(motifSet)
bestAlignment = []
minHammingDist = k*len(DNA)
kmer = ''
for motif in motifSet:
align, dist = ScanAndScoreMotif(DNA, motif)
if (dist < minHammingDist):
bestAlignment = [s for s in align]
minHammingDist = dist
kmer = motif
return bestAlignment, minHammingDist, kmer
%time ContainedMotifSearch(seqApprox,10)
Not exactly the motif we were looking for (off by a 'g'),
[17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa'but boy was it fast! Where's a good Oracle when you need one?
16
If we call Score([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], seqApprox, 10)
DNA[0][17:27] t a g a t c t g a a DNA[1][31:41] t a g a c c a a a a DNA[2][18:28] t a g a c c c g a a DNA[3][33:43] t a a a t c c g a a DNA[4][21:31] t a g g t c c a a a DNA[5][ 0:10] t a g a t t c g a a DNA[6][46:56] c a g a t c c g a a DNA[7][70:80] t a g a t c c g t a DNA[8][16:26] t a g a t c c a a a DNA[9][65:75] t c g a t c c g a a a [0, 9, 1, 9, 0, 0, 1, 3, 9,10] c [1, 1, 0, 0, 2, 9, 8, 0, 0, 0] g [0, 0, 9, 1, 0, 0, 0, 7, 0, 0] t [9, 0, 0, 0, 8, 1, 1, 0, 1, 0] [9, 9, 9, 9, 8, 9, 8, 7, 9,10] Score = 87 Consensus t a g a t c c g a a Our motif!
Any Ideas?
17
def Consensus(s, DNA, k):
"""
compute the consensus k-Motif of an alignment
given offsets into each DNA string.
s = list of starting indices, 1-based, 0 means ignore
DNA = list of nucleotide strings
k = Target Motif length
"""
consensus = ''
for i in xrange(k):
# loop over string positions
cnt = dict(zip("acgt",(0,0,0,0)))
for j, sval in enumerate(s):
# loop over DNA strands
base = DNA[j][sval+i]
cnt[base] += 1
consensus += max(cnt.iteritems(), key=lambda tup: tup[1])[0]
return consensus
def ContainedConsensusMotifSearch(DNA,k):
bestAlignment, minHammingDist, kmer = ContainedMotifSearch(DNA,k)
motif = Consensus(bestAlignment,DNA,k)
newAlignment, HammingDist = ScanAndScoreMotif(DNA, motif)
return newAlignment, HammingDist, motif
%time ContainedConsensusMotifSearch(seqApprox,10)
Now we're cooking!
18
We got the answer that we were looking for, but
19
20
import random
def RandomizedMotifSearch(DNA,k):
""" Searches for a k-length motif that appears
in all given DNA sequences. It begins with a
random set of candidate consensus motifs
derived from the data. It refines the motif
until a true consensus emerges."""
# Seed motifs from random alignments
motifSet = set()
for i in xrange(500):
randomAlignment = [random.randint(0,len(DNA[j])-k) for j in xrange(len(DNA))]
motif = Consensus(randomAlignment, DNA, k)
motifSet.add(motif)
bestAlignment = []
minHammingDist = k*len(DNA)
kmer = ''
testSet = motifSet.copy()
while (len(testSet) > 0):
print len(motifSet),
nextSet = set()
for motif in testSet:
align, dist = ScanAndScoreMotif(DNA, motif)
# add new motifs based on these alignments
newMotif = Consensus(align, DNA, k)
if (newMotif not in motifSet):
nextSet.add(newMotif)
if (dist < minHammingDist):
bestAlignment = [s for s in align]
minHammingDist = dist
kmer = motif
testSet = nextSet.copy()
motifSet = motifSet | nextSet
return bestAlignment, minHammingDist, kmer
%time RandomizedMotifSearch(seqApprox,10)
Randomized algorithms should be restarted multiple times to insure a stable solution.
for i in xrange(10):
print RandomizedMotifSearch(seqApprox,10)
21
22