The following functions are from last time:

In [1]:
def GetAlleles(filename):
    """ Read the alleles in filename """ 
    allele = dict()
    with open(filename, 'rb') as tsvfile:
        for row in tsvfile:
            line = row[:row.find('#')].strip(' \r\n')   # strip comments and line endings
            if (len(line) == 0):                        # skip empty lines
                continue
            field = line.split('\t')
            if (len(field) != 4):                       # print error for lines missing fields
                print "ERROR: %s" % str(field)
                continue
            allele[field[0]] = field[3]
    return allele

def GetCoordinates(filename):
    """ Read the chromosome and positions of alleles in filename """ 
    chrPos = dict()
    with open(filename, 'rb') as tsvfile:
        for row in tsvfile:
            line = row[:row.find('#')].strip(' \r\n')   # strip comments and line endings
            if (len(line) == 0):                        # skip empty lines
                continue
            field = line.split('\t')
            if (len(field) != 4):                       # print error for lines missing fields
                print "ERROR: %s" % str(field)
                continue
            chrPos[field[0]] = (field[1], int(field[2]))
    return chrPos

This is the phasing routine from last time.

In [65]:
alleles = dict()
for sample in "ABCDEFGHIQ":
    alleles[sample] = GetAlleles("genome%s.txt" % sample)
coordinates = GetCoordinates("genomeG.txt")

The following code sorts the alleles by their genomic position

In [66]:
chromo = [str(i) for i in xrange(1,23)] + ['X', 'Y', 'MT']

def compareCoords(a1, a2):
    a1Chr, a1Pos = coordinates[a1]
    a2Chr, a2Pos = coordinates[a2]
    a1Index = chromo.index(a1Chr)
    a2Index = chromo.index(a2Chr)
    if (a1Index == a2Index):
        return a1Pos - a2Pos
    else:
        return a1Index - a2Index

orderedAlleles = sorted(coordinates.keys(), cmp=compareCoords)

A first attempt at a function to compute the relatedness of two samples

In [95]:
related = {
    ('A', 'A'): 1,   ('A', 'AA'): 1,   ('AA', 'A'): 1,   ('AA', 'AA'): 2,
    ('C', 'C'): 1,   ('C', 'CC'): 1,   ('CC', 'C'): 1,   ('CC', 'CC'): 2,
    ('G', 'G'): 1,   ('G', 'GG'): 1,   ('GG', 'G'): 1,   ('GG', 'GG'): 2,
    ('T', 'T'): 1,   ('T', 'TT'): 1,   ('TT', 'T'): 1,   ('TT', 'TT'): 2,
    ('I', 'I'): 1,   ('I', 'II'): 1,   ('II', 'I'): 1,   ('II', 'II'): 2,
    ('D', 'D'): 1,   ('D', 'DD'): 1,   ('DD', 'D'): 1,   ('DD', 'DD'): 2,
    ('AA', 'AC'): 1, ('AA', 'AG'): 1,  ('AA', 'AT'): 1,
    ('AC', 'AA'): 1, ('AC', 'AC'): 2,  ('AC', 'AG'): 1,  ('AC', 'AT'): 1,
    ('AC', 'CC'): 1, ('AC', 'CG'): 1,  ('AC', 'CT'): 1,
    ('AG', 'AA'): 1, ('AG', 'AC'): 1,  ('AG', 'AG'): 2,  ('AG', 'AT'): 1,
    ('AG', 'CG'): 1, ('AG', 'GG'): 1,  ('AG', 'GT'): 1,
    ('AT', 'AA'): 1, ('AT', 'AC'): 1,  ('AT', 'AG'): 1,  ('AT', 'AT'): 2,
    ('AT', 'CT'): 1, ('AT', 'GT'): 1,  ('AT', 'TT'): 1,
    ('CC', 'AC'): 1, ('CC', 'CG'): 1,  ('CC', 'CT'): 1,
    ('CG', 'AC'): 1, ('CG', 'CC'): 1,  ('CG', 'CG'): 2,  ('CG', 'CT'): 1,  
    ('CG', 'AG'): 1, ('CG', 'GG'): 1,  ('CG', 'GT'): 1,
    ('CT', 'CC'): 1, ('CT', 'CG'): 1,  ('CT', 'CT'): 2,
    ('CT', 'AT'): 1, ('CT', 'GT'): 1,  ('CT', 'TT'): 1,
    ('GG', 'AG'): 1, ('GG', 'CG'): 1,  ('GG', 'GT'): 1,
    ('GT', 'AT'): 1, ('GT', 'CT'): 1,  ('GT', 'GT'): 2,  ('GT', 'TT'): 1,
    ('GT', 'AG'): 1, ('GT', 'CG'): 1,  ('GT', 'GG'): 1,
    ('TT', 'AT'): 1, ('TT', 'CT'): 1,  ('TT', 'GT'): 1,
    ('DD', 'DI'): 1, ('DI', 'DD'): 1,  ('DI', 'DI'): 2, 
    ('DI', 'II'): 1, ('II', 'DI'): 1,
    ('AC', 'A'): 1,  ('AC', 'C'): 1,   ('AG', 'A'): 1,   ('AG', 'G'): 1,
    ('AT', 'A'): 1,  ('AT', 'T'): 1,   ('CG', 'C'): 1,   ('CG', 'G'): 1,
    ('CT', 'C'): 1,  ('CT', 'T'): 1,   ('GT', 'G'): 1,   ('GT', 'T'): 1,
    ('A', 'AC'): 1,  ('C', 'AC'): 1,   ('A', 'AG'): 1,   ('G', 'AG'): 1,
    ('A', 'AT'): 1,  ('T', 'AT'): 1,   ('C', 'CG'): 1,   ('G', 'CG'): 1,
    ('C', 'CT'): 1,  ('T', 'CT'): 1,   ('G', 'GT'): 1,   ('T', 'GT'): 1,
}

def relatedness(alleles1, alleles2):
    N, R, B = 0, 0, 0                     # counts
    for gId in orderedAlleles:
        try:
            call = alleles1[gId]
            other = alleles2[gId]
        except:
            continue
        try:
            R += related[call, other]
        except KeyError:
            if ('-' not in call) and ('-' not in other):
                B += 1
            continue
        N += min(len(call), len(other))
    return R, N, B


relResults = {}
sample = "ABCDEFGHIQ"
for i in xrange(len(sample)-1):
    for j in xrange(i+1, len(sample)):
        print sample[i]+sample[j],
        R, N, B = relatedness(alleles[sample[i]], alleles[sample[j]])
        print "%8d %8d %6d %5.2f" % (R, N, B, (100.0*R)/N)
        relResults[sample[i]+sample[j]] = B
AB  1440340  1753369  46104 82.15
AC  1437153  1748784  43516 82.18
AD  1440679  1753122  44432 82.18
AE  1396853  1723139  58554 81.06
AF  1394962  1720781  55923 81.07
AG  1419778  1739305  51091 81.63
AH  1411046  1735320  51617 81.31
AI  1407858  1733651  54341 81.21
AQ  1393975  1723768  60075 80.87
BC  1619500  1843704  14808 87.84
BD  1612492  1848211  15808 87.25
BE  1424888  1759300  60163 80.99
BF  1423335  1757179  57076 81.00
BG  1499415  1805653  37429 83.04
BH  1463508  1786795  45499 81.91
BI  1472157  1789596  46028 82.26
BQ  1423225  1761164  60886 80.81
CD  1639542  1865794  15570 87.87
CE  1423111  1755496  57046 81.07
CF  1439527  1779341  54542 80.90
CG  1508366  1812481  27355 83.22
CH  1493198  1815573  39020 82.24
CI  1476242  1791244  40101 82.41
CQ  1422634  1757438  57865 80.95
DE  1428927  1762110  57692 81.09
DF  1440174  1780996  56050 80.86
DG  1601877  1875349    136 85.42
DH  1538494  1846692  26698 83.31
DI  1531823  1830369  24692 83.69
DQ  1424296  1761707  58680 80.85
EF  1421466  1753423  56341 81.07
EG  1591589  1863848   7154 85.39
EH  1506036  1808533  32075 83.27
EI  1494345  1802859  37624 82.89
EQ  1419472  1755040  61571 80.88
FG  1420065  1753436  56957 80.99
FH  1609977  1890083    103 85.18
FI  1592282  1864363    325 85.41
FQ  1418002  1752177  58905 80.93
GH  1594680  1869911    332 85.28
GI  1592863  1865965   6855 85.36
GQ  1420744  1756624  61177 80.88
HI  1605075  1837363  17632 87.36
HQ  1423349  1759735  58041 80.88
IQ  1420322  1756863  61496 80.84

In [76]:
expResult = { 
    'BC': 0.5, 'BD': 0.5, 'CD': 0.5, 'HI': 0.5,     # siblings
    'DG': 0.5, 'EG': 0.5,                           # parents
    'FH': 0.5, 'FI': 0.5, 'GH': 0.5, 'GI': 0.5,     # parents
    'DH': 0.25,'DI': 0.25,'EH': 0.25,'EI': 0.25,    # grandparents
    'AB': 0.125, 'AC': 0.125, 'AD': 0.125,          # 1st cousin once removed
    'AG': 0.0625,                                   # 1st cousin twice removed
    'AH': 0.03125, 'AI': 0.03125                    # 1st cousin thrice removed
    }
In [77]:
%matplotlib inline
import matplotlib.pyplot as plot
import numpyx = [relResults[key] for key in expResult.iterkeys()]
In [96]:
x = [relResults[key] for key in expResult.iterkeys()]
y = [expResult[key] for key in expResult.iterkeys()]

plot.scatter(x, y)

A = numpy.vstack([x, numpy.ones(len(x))]).T
m, c = numpy.linalg.lstsq(A,y)[0]

t = numpy.arange(0, 60000.0, 2500.0)
plot.plot(t, m*t+c, 'r:')
Out[96]:
[<matplotlib.lines.Line2D at 0x16f8de890>]
In [94]:
print m, c
for key, value in sorted(relResults.iteritems()):
    print key, 100*(m*value + c)
-9.4422235859e-06 0.556765515361
AB 12.1441239157
AC 14.5877713797
AD 13.7228636993
AE 0.388555551254
AF 2.8728045767
AG 7.43528701341
AH 6.93862605279
AI 4.36656434799
AQ -1.04760665616
BC 41.6945068501
BD 40.7502844915
BE -1.13069822372
BF 1.78411619725
BG 20.3352528765
BH 12.7153784426
BI 12.2158848149
BQ -1.81337098898
CD 40.9750094129
CE 1.81244286801
CF 4.17677565392
CG 29.8473489169
CH 18.8329951039
CI 17.8122907343
CQ 1.03912475632
DE 1.20247522436
DF 2.75288833716
DG 55.5481372954
DH 30.4677030065
DI 32.3618130578
DQ 0.269583534071
EF 2.47811963081
EG 48.9215847828
EH 25.3906193844
EI 20.1511295165
EQ -2.46016330461
FG 1.89647865792
FH 55.5792966332
FI 55.3696792696
FQ 0.0571335033887
GH 55.3630697131
GI 49.203907268
GQ -2.08813969533
HI 39.0280229095
HQ 0.87294162121
IQ -2.38934662772

A better model might be one of the form: R = 1/(k x + 1)

which can be rewritten as:

 R*x k + R = 1
 
 R*x k = 1 - R
In [106]:
A = [relResults[key]*expResult[key] for key in expResult.iterkeys()]
y2 = [1.0 - expResult[key] for key in expResult.iterkeys()]

A = numpy.vstack([A]).T
k = numpy.linalg.lstsq(A,y)[0][0]
print m

plot.scatter(x, y)
plot.plot(t, 1.0/(k*t + 1), 'r:')
plot.plot(t, 1.0/(2.0*k*t + 1), 'g:')
plot.plot(t, 1.0/(0.5*k*t + 1), 'b:')
4.74356594753e-05

Out[106]:
[<matplotlib.lines.Line2D at 0x16ea74e50>]

Did I do something wrong?

In []: