From last time.

In [2]:
import os
import pysam

%matplotlib inline
import matplotlib.pyplot as plot
import numpy

Read the BAM file and its index

In [3]:
bamfile = pysam.Samfile("FF0683F.bam", "rb")
if not os.path.exists("FF0683F.bam.bai"):
    pysam.index("FF0683F.bam")

Next let's consider a gene with multiple exons, Zfp324 (Chr7, 13551187 - 13559585, +). There are relatively few reads covering this gene (<1000). I have also made an effort to highlight the subset of reads that include splices.

In [36]:
channel = [0 for i in xrange(50)]
plot.figure(figsize=(15, 5))
N = 0
S = 0
for read in bamfile.fetch('chr7', 13551187, 13559585):
    if (read.aend - read.pos > len(read.seq)):
        # look for an availble channel to plot read
        for i, end in enumerate(channel):
            if (read.pos > end+100):
                channel[i] = read.aend-1
                y = i
                break
        else:
            # add a new channel if none is available
            channel.append(read.aend-1)
            y = len(channel)
        plot.plot([read.pos, read.aend-1], [y,y], 'dg')
        plot.plot([read.pos, read.aend-1], [y,y], ':g')
        S += 1
    N += 1
    
print "%d reads, %d with splices" % (N, S)

x = []
y = []

for column in bamfile.pileup('chr7', 13551187, 13559585):
    x.append(column.pos)
    n = 0
    for read in column.pileups:
        if (not read.is_del):
            n += 1
    y.append(n)

plot.plot(x, y, 'b')
plot.plot([x[0], x[-1]], [numpy.mean(y), numpy.mean(y)], ':r')
863 reads, 72 with splices

Out[36]:
[<matplotlib.lines.Line2D at 0x121623350>]
In []:
Next we consider a highly expressed gene, Peg3 (Chr7, 6656603 - 6683132, +)
In [40]:
channel = [0 for i in xrange(50)]
plot.figure(figsize=(15, 10))
N = 0
S = 0
for read in bamfile.fetch('chr7', 6656603, 6683132):
    if (read.aend - read.pos > len(read.seq)):
        # look for an availble channel to plot read
        for i, end in enumerate(channel):
            if (read.pos > end+100):
                channel[i] = read.aend-1
                y = i
                break
        else:
            # add a new channel if none is available
            channel.append(read.aend-1)
            y = len(channel)
        plot.plot([read.pos, read.aend-1], [y,y], 'dg')
        plot.plot([read.pos, read.aend-1], [y,y], ':g')
        S += 1
    N += 1
    
print "%d reads, %d with splices" % (N, S)

x = []
y = []

for column in bamfile.pileup('chr7', 6656603, 6683132):
    x.append(column.pos)
    n = 0
    for read in column.pileups:
        if (not read.is_del):
            n += 1
    y.append(n)

plot.plot(x, y, 'b')
plot.plot([x[0], x[-1]], [numpy.mean(y), numpy.mean(y)], ':r')
40758 reads, 4928 with splices

Out[40]:
[<matplotlib.lines.Line2D at 0x128abf550>]
In []: