|
1
'PIT', 'PTI', 'ITP', 'IPT', 'TPI', and 'TIP' all weigh the same
'PIT' breaks into 'PI' and 'IT', while
'PTI' breaks into 'PT' and 'TI'
3
Example:
Molecular weight of Glycine Amino Acid
$$W(C_2 H_5 N O_2) = 12 \times 2 + 5 \times 1 + 14 + 16 \times 2 = 75$$
Molecular wieght of Glycine Residue (Minus the $H_{2} O$ lost forming the peptide bond)
$$W(C_2 H_5 N O_2 - H_2 O) = 57$$
4
AminoAcid = {
'A': 'Alanine', 'C': 'Cysteine', 'D': 'Aspartic acid', 'E': 'Glutamic acid',
'F': 'Phenylalanine', 'G': 'Glycine', 'H': 'Histidine', 'I': 'Isoleucine',
'K': 'Lysine', 'L': 'Leucine', 'M': 'Methionine', 'N': 'Asparagine',
'P': 'Proline', 'Q': 'Glutamine', 'R': 'Arginine', 'S': 'Serine',
'T': 'Theronine', 'V': 'Valine', 'W': 'Tryptophan', 'Y': 'Tyrosine',
'*': 'STOP'
}
AminoAbbrv = {
'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu',
'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile',
'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn',
'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser',
'T': 'Thr', 'V': 'Val', 'W': 'Trp', 'Y': 'Tyr',
'*': 'STP'
}
# Now it's time to use this dictionary!
Daltons = {
'A': 71, 'C': 103, 'D': 115, 'E': 129,
'F': 147, 'G': 57, 'H': 137, 'I': 113,
'K': 128, 'L': 113, 'M': 131, 'N': 114,
'P': 97, 'Q': 128, 'R': 156, 'S': 87,
'T': 101, 'V': 99, 'W': 186, 'Y': 163
}
5
6
TyrocidineB1 = "VKLFPWFNQY"
# The weight of Tyrocidine B1
print sum([Daltons[res] for res in TyrocidineB1])
7
8
def TheoreticalSpectrum(peptide):
# Generate every possible fragment of a peptide
spectrum = set()
for fragLength in xrange(1,len(peptide)+1):
for start in xrange(0,len(peptide)-fragLength+1):
seq = peptide[start:start+fragLength]
spectrum.add(sum([Daltons[res] for res in seq]))
return sorted(spectrum)
print TyrocidineB1
spectrum = TheoreticalSpectrum(TyrocidineB1)
print len(spectrum)
print spectrum
peptide = TyrocidineB1
fragList = []
for fragLength in xrange(1,len(peptide)+1):
for start in xrange(0,len(peptide)-fragLength+1):
seq = peptide[start:start+fragLength]
fragList.append((sum([Daltons[res] for res in seq]), seq))
print len(fragList)
lastWeight = 0
for weight, frag in sorted(fragList):
print "%10s: %d%s" % (frag, weight, "*" if (weight == lastWeight) else " ")
lastWeight = weight
spectrum = TheoreticalSpectrum('PLAY')
print len(spectrum)
print spectrum
9
10
Can you think of a Brute-Force way of solving this problem?
Here's one:
11
def PossiblePeptide(spectrum, prefix=''):
""" A brute force method of generating all peptide
sequences that add up to our target weight
from the given spectrum """
global peptideList
if (len(prefix) == 0):
peptideList = []
current = sum([Daltons[res] for res in prefix])
target = max(spectrum) # our target
if (current == target):
peptideList.append(prefix)
elif (current < target):
for residue in Daltons.iterkeys():
PossiblePeptide(spectrum, prefix+residue)
def TestPeptides(candidateList, target):
filteredList = []
for peptide in candidateList:
candidateSpectrum = TheoreticalSpectrum(peptide)
if (candidateSpectrum == target):
filteredList.append(peptide)
return filteredList
spectrum = TheoreticalSpectrum('PLAY')
%time PossiblePeptide(spectrum)
print len(peptideList), "candidates"
print "PLAY" in peptideList
%time matches = TestPeptides(peptideList, spectrum)
print matches
print "PLAY" in matches
12
13
The brute force method does not make good use of the spectrum it is given
We could only extend our prefix with residues that appear in our spectrum
Actual fragments:
P L A Y PL LA AY PLA LAY PLAY
Growing and Checking prefixes:
A I L P Y
AI = LA IA = LA LA = LA PI = PL YA = AY
AIP = PLA IAP = PLA LAP = PLA PIA = PLA YAI = LAY
AIPY = PLAY IAPY = PLAY LAPY = PLAY PIAY = PLAY YAIP = PLAY
AIY = LAY IAY = LAY LAY = LAY YAL = LAY
AIYP = PLAY IAYP = PLAY LAYP = PLAY YALP = PLAY
AL = LA IP = PL LP = PL PL = PL
ALP = PLA IPA = PLA LPA = PLA PLA = PLA
ALPY = PLAY IPAY = PLAY LPAY = PLAY PLAY = PLAY
ALY = LAY
ALYP = PLAY
AY = AY
AYI = LAY
AYIP = PLAY
AYL = LAY
AYLP = PLAY
14
def ImprovedPossiblePeptide(spectrum, prefix=''):
global peptideList
if (len(prefix) == 0):
peptideList = []
current = sum([Daltons[res] for res in prefix])
target = max(spectrum)
if (current == target):
peptideList.append(prefix)
elif (current < target):
for residue in Daltons.iterkeys():
if (Daltons[residue] not in spectrum):
continue
extend = prefix + residue
if (sum([Daltons[res] for res in extend]) not in spectrum):
continue
ImprovedPossiblePeptide(spectrum, extend)
spectrum = TheoreticalSpectrum('PLAY')
%time ImprovedPossiblePeptide(spectrum)
print len(peptideList)
print "PLAY" in peptideList
%time matches = TestPeptides(peptideList, spectrum)
print matches
print "PLAY" in matches
for peptide in peptideList:
print peptide
TheoreticalSpectrum('PLAY')
TheoreticalSpectrum('LAPY')
print sum([Daltons[res] for res in 'AP']) # Suffix of 'LAP' prefix
print sum([Daltons[res] for res in 'APY']) # Suffix of 'LAPY'
print sum([Daltons[res] for res in 'PY']) # Suffix of 'LAPY'
15
def UltimatePossiblePeptide(spectrum, prefix=''):
global peptideList
if (len(prefix) == 0):
peptideList = []
current = sum([Daltons[res] for res in prefix])
target = max(spectrum)
if (current == target):
peptideList.append(prefix)
elif (current < target):
for residue in Daltons.iterkeys():
extend = prefix + residue
suffix = [extend[i:] for i in xrange(len(extend))]
for fragment in suffix:
if (sum([Daltons[res] for res in fragment]) not in spectrum):
break
else:
UltimatePossiblePeptide(spectrum, extend)
spectrum = TheoreticalSpectrum('PLAY')
%time UltimatePossiblePeptide(spectrum)
print len(peptideList)
print "PLAY" in peptideList
%time matches = TestPeptides(peptideList, spectrum)
print matches
print "PLAY" in matches
16
spectrum = TheoreticalSpectrum(TyrocidineB1)
%time UltimatePossiblePeptide(spectrum)
print len(peptideList)
print TyrocidineB1 in peptideList
%time matches = TestPeptides(peptideList, spectrum)
print len(matches)
print TyrocidineB1 in matches
for peptide in peptideList:
print peptide
All of these peptides give also give us our desired spectrum
17
18
97, 99, 113, 114, 128, 147, 163, 186, 200, 227, 241, 242, 244, 260, 261, 283, 291, 333, 340, 357, 388, 389, 405, 430, 447, 457, 485, 487, 543, 544, 552, 575, 577, 584, 659, 671, 672, 690, 691, 731, 738, 770, 804, 818, 819, 835, 906, 917, 932, 982, 1031, 1060, 1095, 1159, 1223, 1322
False Masses: present in the experimental spectrum, but not in the theoretical spectrum
Missing Masses: present in the theoretical spectrum, but not in the experimental spectrum
19
97, 99, 113, 128, 147, 163, 186, 200, 227, 241, 242, 244, 260, 261, 283, 291, 333, 340, 357, 405, 430, 447, 457, 487, 543, 544, 552, 575, 577, 584, 659, 671, 672, 690, 691, 731, 738, 770, 804, 818, 819, 835, 906, 917, 932, 982, 1031, 1095, 1159, 1322
False Masses: We don't know which these are
Missing Masses: And these values don't appear
20
# generate a synthetic experimental spectrum with 10% Error
import random
random.seed(1961)
spectrum = TheoreticalSpectrum(TyrocidineB1)
# Pick around ~10% at random to remove
missingMass = random.sample(spectrum, 6)
print "Missing Masses = ", missingMass
# Add back another ~10% of false, but actual, peptide masses
falseMass = []
for i in xrange(5):
fragment = ''.join(random.sample(Daltons.keys(), random.randint(2,len(TyrocidineB1)-2)))
weight = sum([Daltons[residue] for residue in fragment])
falseMass.append(weight)
print "False Masses = ", falseMass
experimentalSpectrum = sorted(set([mass for mass in spectrum if mass not in missingMass] + falseMass))
print experimentalSpectrum
21
22
import itertools
def LeaderboardFindPeptide(noisySpectrum, cutThreshold=0.05):
# Golf Tournament Heuristic
spectrum = set(noisySpectrum)
target = max(noisySpectrum)
players = [''.join(peptide) for peptide in itertools.product(Daltons.keys(), repeat=2)]
round = 1
currentLeader = [0.0, '']
while True:
print "%8d Players in round %d" % (len(players), round)
leaderboard = []
for prefix in players:
testSpectrum = set(TheoreticalSpectrum(prefix))
totalWeight = max(testSpectrum)
score = len(spectrum & testSpectrum)/float(len(testSpectrum))
if (totalWeight == target):
if (score > currentLeader[0]):
currentLeader = [score, prefix]
elif (score == currentLeader[0]):
currentLeader += [prefix]
elif (totalWeight < target):
leaderboard.append((score, prefix))
remaining = len(leaderboard)
if (remaining == 0):
print "Done, no sequences can be extended"
break
leaderboard.sort(reverse=True)
# Prune the larger of the top 5% or the top 5 players
cut = leaderboard[max(min(5,remaining-1),int(remaining*cutThreshold))][0]
players = [p+r for s, p in leaderboard if s >= cut for r in Daltons.iterkeys()]
round += 1
return currentLeader
spectrum = TheoreticalSpectrum(TyrocidineB1)
experimentalSpectrum = [mass for mass in spectrum if mass not in missingMass] + falseMass
%time winners = LeaderboardFindPeptide(experimentalSpectrum)
print winners
print len(winners) - 1, "Candidate residues with", winners[0], 'matches'
print TyrocidineB1
print TyrocidineB1 in winners
23
24