import gzip, os, sys
from IPython.display import HTML
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import scipy as sp
from collections import defaultdict
#import custom functions for displaying tables, bash commands
sys.path.append(os.path.abspath("/home/damian/"))
from dk_ipython import *
%matplotlib inline
HTML(addToggle())
Raw data processing
K-mer analysis
Contig assembly
Scaffolding
CEGMA
prefix = '/home/share/projects/phaw_genome/draft_2.3/'
Eight shotgun libraries (SGL) were made from a single male animal (sample 4). The 500bp insert libraries (short-SGL) are actually 646bp inserts; the 800bp libraries (long-SGL) are actually 1100bp inserts.
A total of 10 files for the P. hawaiensis genome were downloaded from Anastasios' Dropbox account:
Sample_4_AP_500bp_Lane1_1
Sample_4_AP_500bp_Lane1_2
Sample_4_AP_500bp_Lanes2-3_1
Sample_4_AP_500bp_Lanes2-3_2
Sample_4_AP_800bp_Lane1_1
Sample_4_AP_800bp_Lane1_2
Sample_4_AP_500bp_Lanes4-5_1
Sample_4_AP_500bp_Lanes4-5_2
Sample_4_AP_800bp_Lane2-3_1
Sample_4_AP_800bp_Lane2-3_2
Sample_4_AP_500bp_Lanes2-3, Sample_4_AP_500bp_Lanes4-5 and Sample_4_AP_800bp_Lane2-3 each consist of sequences from two lanes.
Illumina headers are in the format:
Machine ID : Run ID : Flow cell ID : Lane ID : Tile ID : X Location : Y Location (pair information follows after a space)
Ex. HWI-ST651:295:HAE2KADXX:1:1101:1157:2016
Sample_4_AP_500bp_Lanes2-3 contains two lanes:
@HWI-ST651:295:HAE2KADXX:1:
@HWI-ST651:295:HAE2KADXX:2:
From Anastasios' e-mail:
Sample_4_AP_6kb_Lanes1-2_1 (reads from 5-7kb MPL lanes 1 and 2 from one end)
Sample_4_AP_6kb_Lanes1-2_2 (reads from 5-7kb MPL lanes 1 and 2 from other end)
Sample_4_AP_6kb_Lanes3-4_1 (reads from 5-7kb MPL lanes 3 and 4 from one end)
Sample_4_AP_6kb_Lanes3-4_2 (reads from 5-7kb MPL lanes 3 and 4 from other end)
Sample_4_AP_8kb_Lanes1-2_1 (reads from 7-9kb MPL lanes 1 and 2 from one end)
Sample_4_AP_8kb_Lanes1-2_2 (reads from 7-9kb MPL lanes 1 and 2 from other end)
Sample_4_AP_8kb_Lanes3-4_1 (reads from 7-9kb MPL lanes 3 and 4 from one end)
Sample_4_AP_8kb_Lanes3-4_2 (reads from 7-9kb MPL lanes 3 and 4 from other end)
Sample_4_AP_9kb_Lanes1-2_1 (reads from 9-12kb MPL lanes 1 and 2 from one end)
Sample_4_AP_9kb_Lanes1-2_2 (reads from 9-12kb MPL lanes 1 and 2 from other end)
Sample_4_AP_9kb_Lanes3-4_1 (reads from 9-12kb MPL lanes 3 and 4 from one end)
Sample_4_AP_9kb_Lanes3-4_2 (reads from 9-12kb MPL lanes 3 and 4 from other end)
Sample_4_AP_20kb_Lanes1-2_1 (reads from ?kb MPL lanes 1 and 2 from one end)
Sample_4_AP_20kb_Lanes1-2_2 (reads from ?kb MPL lanes 1 and 2 from other end)
Sample_4_AP_20kb_Lanes3-4_1 (reads from ?kb MPL lanes 3 and 4 from one end)
Sample_4_AP_20kb_Lanes3-4_2 (reads from ?kb MPL lanes 3 and 4 from other end)
A total of 16 mate-pair library (MPL) files were downloaded:
Sample_4_AP_20kb_Lanes1-2_1.fastq.gz
Sample_4_AP_20kb_Lanes1-2_2.fastq.gz
Sample_4_AP_20kb_Lanes3-4_1.fastq.gz
Sample_4_AP_20kb_Lanes3-4_2.fastq.gz
Sample_4_AP_6kb_Lanes1-2_1.fastq.gz
Sample_4_AP_6kb_Lanes1-2_2.fastq.gz
Sample_4_AP_6kb_Lanes3-4_1.fastq.gz
Sample_4_AP_6kb_Lanes3-4_2.fastq.gz
Sample_4_AP_8kb_Lanes1-2_1.fastq.gz
Sample_4_AP_8kb_Lanes1-2_2.fastq.gz
Sample_4_AP_8kb_Lanes3-4_1.fastq.gz
Sample_4_AP_8kb_Lanes3-4_2.fastq.gz
Sample_4_AP_9kb_Lanes1-2_1.fastq.gz
Sample_4_AP_9kb_Lanes1-2_2.fastq.gz
Sample_4_AP_9kb_Lanes3-4_1.fastq.gz
Sample_4_AP_9kb_Lanes3-4_2.fastq.gz
Samples came from a single male animal (sample 4).
Every MPL file contains reads from two libraries (two lanes).
Files containing multiple lanes were split into individual files by lane. This was done by first identifying the two lane header prefixes and then distributing the fastq entries according to their header prefix.
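The splitting script itself was not preserved in this notebook; a minimal sketch of the idea, using the lane header prefixes shown below (the function name and output naming are illustrative):
import gzip

def splitByLane(inPath, prefixA, prefixB, outPathA, outPathB):
    #distribute 4-line fastq entries into two files based on the header prefix
    inFile = gzip.open(inPath)
    outA = gzip.open(outPathA, 'w')
    outB = gzip.open(outPathB, 'w')
    while True:
        entry = [inFile.readline() for i in range(4)]
        if not entry[0]:
            break
        if entry[0].startswith(prefixA):
            outA.writelines(entry)
        elif entry[0].startswith(prefixB):
            outB.writelines(entry)
    for f in (inFile, outA, outB):
        f.close()

#e.g. splitByLane('Sample_4_AP_500bp_Lanes2-3_1.fastq.gz',
#                 '@HWI-ST651:295:HAE2KADXX:1:', '@HWI-ST651:295:HAE2KADXX:2:',
#                 'Sample_4_AP_500bp_Lanes2-3_1_Asplit.fastq.gz',
#                 'Sample_4_AP_500bp_Lanes2-3_1_Bsplit.fastq.gz')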
The files were renamed in the format of:
renameCmds = '''ln -s raw/pilot/Sample_4_AP_500bp_Lane1_1.fastq.gz AP_phaw_S4_shortsgl_01_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lane1_2.fastq.gz AP_phaw_S4_shortsgl_01_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes2-3_1_A.fastq.gz AP_phaw_S4_shortsgl_02_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes2-3_1_B.fastq.gz AP_phaw_S4_shortsgl_03_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes2-3_2_A.fastq.gz AP_phaw_S4_shortsgl_02_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes2-3_2_B.fastq.gz AP_phaw_S4_shortsgl_03_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes4-5_1_A.fastq.gz AP_phaw_S4_shortsgl_04_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes4-5_1_B.fastq.gz AP_phaw_S4_shortsgl_05_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes4-5_2_A.fastq.gz AP_phaw_S4_shortsgl_04_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_500bp_Lanes4-5_2_B.fastq.gz AP_phaw_S4_shortsgl_05_B.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes6-7_1_Asplit.fastq.gz AP_phaw_S4_shortsgl_06_A.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes6-7_2_Asplit.fastq.gz AP_phaw_S4_shortsgl_06_B.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes6-7_1_Bsplit.fastq.gz AP_phaw_S4_shortsgl_07_A.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes6-7_2_Bsplit.fastq.gz AP_phaw_S4_shortsgl_07_B.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes8-9_1_Asplit.fastq.gz AP_phaw_S4_shortsgl_08_A.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes8-9_2_Asplit.fastq.gz AP_phaw_S4_shortsgl_08_B.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes8-9_1_Bsplit.fastq.gz AP_phaw_S4_shortsgl_09_A.fastq.gz
ln -s raw/Sample_4_AP_500bp_Lanes8-9_2_Bsplit.fastq.gz AP_phaw_S4_shortsgl_09_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_800bp_Lane1_1.fastq.gz AP_phaw_S4_longsgl_01_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_800bp_Lane1_2.fastq.gz AP_phaw_S4_longsgl_01_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_800bp_Lanes2-3_1_A.fastq.gz AP_phaw_S4_longsgl_02_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_800bp_Lanes2-3_1_B.fastq.gz AP_phaw_S4_longsgl_03_A.fastq.gz
ln -s raw/pilot/Sample_4_AP_800bp_Lanes2-3_2_A.fastq.gz AP_phaw_S4_longsgl_02_B.fastq.gz
ln -s raw/pilot/Sample_4_AP_800bp_Lanes2-3_2_B.fastq.gz AP_phaw_S4_longsgl_03_B.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes1-2_1_Asplit.fastq.gz AP_phaw_S4_20kbmpl_01_A.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes1-2_1_Bsplit.fastq.gz AP_phaw_S4_20kbmpl_02_A.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes1-2_2_Asplit.fastq.gz AP_phaw_S4_20kbmpl_01_B.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes1-2_2_Bsplit.fastq.gz AP_phaw_S4_20kbmpl_02_B.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes3-4_1_Asplit.fastq.gz AP_phaw_S4_20kbmpl_03_A.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes3-4_1_Bsplit.fastq.gz AP_phaw_S4_20kbmpl_04_A.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes3-4_2_Asplit.fastq.gz AP_phaw_S4_20kbmpl_03_B.fastq.gz
ln -s raw/Sample_4_AP_20kb_Lanes3-4_2_Bsplit.fastq.gz AP_phaw_S4_20kbmpl_04_B.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes1-2_1_Asplit.fastq.gz AP_phaw_S4_6kbmpl_01_A.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes1-2_1_Bsplit.fastq.gz AP_phaw_S4_6kbmpl_02_A.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes1-2_2_Asplit.fastq.gz AP_phaw_S4_6kbmpl_01_B.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes1-2_2_Bsplit.fastq.gz AP_phaw_S4_6kbmpl_02_B.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes3-4_1_Asplit.fastq.gz AP_phaw_S4_6kbmpl_03_A.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes3-4_1_Bsplit.fastq.gz AP_phaw_S4_6kbmpl_04_A.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes3-4_2_Asplit.fastq.gz AP_phaw_S4_6kbmpl_03_B.fastq.gz
ln -s raw/Sample_4_AP_6kb_Lanes3-4_2_Bsplit.fastq.gz AP_phaw_S4_6kbmpl_04_B.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes1-2_1_Asplit.fastq.gz AP_phaw_S4_8kbmpl_01_A.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes1-2_1_Bsplit.fastq.gz AP_phaw_S4_8kbmpl_02_A.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes1-2_2_Asplit.fastq.gz AP_phaw_S4_8kbmpl_01_B.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes1-2_2_Bsplit.fastq.gz AP_phaw_S4_8kbmpl_02_B.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes3-4_1_Asplit.fastq.gz AP_phaw_S4_8kbmpl_03_A.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes3-4_1_Bsplit.fastq.gz AP_phaw_S4_8kbmpl_04_A.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes3-4_2_Asplit.fastq.gz AP_phaw_S4_8kbmpl_03_B.fastq.gz
ln -s raw/Sample_4_AP_8kb_Lanes3-4_2_Bsplit.fastq.gz AP_phaw_S4_8kbmpl_04_B.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes1-2_1_Asplit.fastq.gz AP_phaw_S4_9kbmpl_01_A.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes1-2_1_Bsplit.fastq.gz AP_phaw_S4_9kbmpl_02_A.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes1-2_2_Asplit.fastq.gz AP_phaw_S4_9kbmpl_01_B.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes1-2_2_Bsplit.fastq.gz AP_phaw_S4_9kbmpl_02_B.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes3-4_1_Asplit.fastq.gz AP_phaw_S4_9kbmpl_03_A.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes3-4_1_Bsplit.fastq.gz AP_phaw_S4_9kbmpl_04_A.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes3-4_2_Asplit.fastq.gz AP_phaw_S4_9kbmpl_03_B.fastq.gz
ln -s raw/Sample_4_AP_9kb_Lanes3-4_2_Bsplit.fastq.gz AP_phaw_S4_9kbmpl_04_B.fastq.gz'''
HTML(textarea('Commands for renamed symbolic links',renameCmds))
print 'Table for old and new files'
nameTable = ListTable()
nameTable.append(['Old Name','New Name'])
for line in renameCmds.split('\n'):
nameTable.append([line.strip().split()[2].split('/')[-1], line.strip().split()[-1]])
nameTable
Trimming at a minimum length of 100 and quality of 30 with Trimmomatic
parallel -j 4 :::: trim.sh > trim.out 2> trim.err &
fs = [x for x in os.listdir('/home/share/ngsData/phaw/') if x.split('.')[-1] == 'gz' and x.find('sgl') != -1]
trimCmdPEA = 'java -jar /home/share/software/Trimmomatic-0.32/trimmomatic-0.32.jar PE -threads 6 '
trimCmdB = ' LEADING:__QUAL__ TRAILING:__QUAL__ MINLEN:100 ' + \
'ILLUMINACLIP:/home/share/software/Trimmomatic-0.32/adapters/adapters.fa:2:30:12:1:false TOPHRED33'
trimCmds = ''
inPrefix = '/home/share/ngsData/phaw/'
outPrefix = '/home/share/projects/phaw_genome/trim/'
for lib in list(set(['_'.join(f.split('_')[:-1]) for f in fs])):
a = lib + '_A'
b = lib + '_B'
phred = detectPhred(inPrefix + a + '.fastq.gz', 5000)
if phred == 'Sanger':
phred = '-phred33 '
else:
phred = '-phred64 '
#paired end
trimCmds += trimCmdPEA + phred + inPrefix + a + '.fastq.gz ' + inPrefix + b + '.fastq.gz ' + \
outPrefix + a + '.paired.trimmed.fastq.gz ' + outPrefix + a + '.unpaired.trimmed.fastq.gz ' + \
outPrefix + b + '.paired.trimmed.fastq.gz ' + outPrefix + b + '.unpaired.trimmed.fastq.gz' + trimCmdB + '\n'
qTrimCmds = trimCmds.replace('__QUAL__','30')
HTML(textarea('Trimmomatic bash commands at quality 30 filter',qTrimCmds))
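detectPhred comes from the custom dk_ipython module and its body is not shown in this notebook. A minimal sketch of what such a detector could look like, assuming it samples quality lines and checks the ASCII range (the name and signature come from the call above; the body is an assumption):
import gzip

def detectPhred(path, nReads):
    #quality characters below ASCII 59 (';') can only occur with the
    #Sanger/Phred+33 offset; Phred+64 data starts at ASCII 64 ('@')
    inFile = gzip.open(path)
    for i in range(nReads):
        entry = [inFile.readline() for j in range(4)]
        if not entry[0]:
            break
        if min([ord(c) for c in entry[3].strip()]) < 59:
            inFile.close()
            return 'Sanger'
    inFile.close()
    return 'Illumina 1.5+'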
The MPL libraries were made with the Nextera mate-pair protocol. Briefly, the protocol works as follows:
Adapters are added to the ends of the fragments
-adapter-|-----fragment-----|-adapter-
The fragment is then circularized, ligating the two adapters together.
The circularized fragment is fragmented again, and the piece containing the merged adapters is selected:
--circularized fragment--|-adapter--adapter-|--circularized fragment--
This fragment, with an adapter in the middle, is then paired-end sequenced. The resulting pair is in reverse/forward orientation.
The adapter in the middle is called the junction adapter. The sequencing can produce pairs where:
- Both reads of the pair contain adapter sequence
- Only one read of the pair contains adapter sequence
- Neither read of the pair contains adapter sequence
Trimming Nextera adapters can be more complicated than normal adapter trimming because the paired-end sequencing can read unevenly into the fragment, so that the longer read runs through the entire junction adapter and into the partial fragment beyond it:
-----------------------reverse read------------->
--circularized fragment--|-adapter--adapter-|--circularized fragment--
<--forward read------
This results in a read with the junction adapter sequence embedded inside it.
The Nextclip program is used to clip Nextera MPL libraries. When a read contains the entire junction adapter plus partial fragment, the read is clipped back to the start of the junction adapter. Nextclip outputs 4 classes of reads:
A. Both reads of the pair contain adapter
B. Read 1 contains adapter
C. Read 2 contains adapter
D. Neither read contains adapter
The Nextclip manual suggests discarding class D reads, as they could come from circularized fragments without adapters, in which case the insert size would be unknown or very short.
For contig assembly, all 4 classes of clipped reads will be used, since fragment size does not matter for contigging. For scaffolding, all four classes (A, B, C and D) will also be used; see the fragment-size analysis below.
The command used (minimum read length of 50, removing PCR duplicates):
nextclip -d -m 50 -i A.fastq -j B.fastq -o libName
The following table presents stats on the Nextclip output:
clipStat = open(prefix + 'reads/clip.stat').read().strip()
blocks = clipStat.split('NextClip v1.3.1')[1:]
clipTable = ListTable()
clipTable.append(['Lib',\
'Total Pairs',\
'Duplicate Pairs',\
'Too Short',\
'Reads with N',\
'Cat. A','Cat. B',\
'Cat. C','Cat. D',\
'A + B + C','A + B + C + D'])
for block in blocks:
lines = [line for line in block.strip().split('\n') if line != '']
lib = ''
for line in lines:
        if line.split()[0] == "Opening":
lib = line.strip().split()[-1].split('/')[-1].split('.')[0][:-2]
break
d = block.strip().split('SUMMARY')
if len(d) == 2:
data = dict([x.strip().split(": ") for x in d[1].split('\n')[:-1] if x != ''])
clipTable.append([lib,data['Number of read pairs']\
,data['Number of duplicate pairs'].split()[1]\
,data['All categories too short'].split()[1]\
,data['Number of pairs containing N'].split()[1]\
,data['A pairs long enough'].split()[1]\
,data['B pairs long enough'].split()[1]\
,data['C pairs long enough'].split()[1]\
,data['D pairs long enough'].split()[1]\
,data['Total usable pairs'].split()[1]\
,data['All long enough'].split()[1]])
clipTable
readStat = open(prefix + 'reads/reads.count')
rawShortBases = sum([int(x.split('\t')[1]) for x in open(prefix + 'reads/raw.baseCount').read().strip().split('\n')\
if x.split('\t')[0].find('shortsgl') != -1])
rawLongBases = sum([int(x.split('\t')[1]) for x in open(prefix + 'reads/raw.baseCount').read().strip().split('\n')\
if x.split('\t')[0].find('longsgl') != -1])
rawMPLBases = sum([int(x.split('\t')[1]) for x in open(prefix + 'reads/raw.baseCount').read().strip().split('\n')\
if x.split('\t')[0].find('kbmpl') != -1])
shortCount = [0,0]
longCount = [0,0]
mplCount = [0,0]
for line in readStat:
meta = line.strip().split()
if meta[0].find('mpl') != -1:
mplCount[0] += int(meta[1])
mplCount[1] += int(meta[2])
elif meta[0].find('shortsgl') != -1:
shortCount[0] += int(meta[1])
shortCount[1] += int(meta[2])
elif meta[0].find('longsgl') != -1:
longCount[0] += int(meta[1])
longCount[1] += int(meta[2])
print 'Read stats after trimming/clipping'
totalTable = ListTable()
totalTable.append(['Library Type',\
'# bases before trimming',\
'# bases after trimming/clipping',\
'Average trimmed/clipped read length'])
totalTable.append(['Short-SGL',\
commas(rawShortBases),\
commas(shortCount[0]),\
float(shortCount[0] / float(shortCount[1]))])
totalTable.append(['Long-SGL',\
commas(rawLongBases),\
commas(longCount[0]),\
float(longCount[0] / float(longCount[1]))])
totalTable.append(['MPL',\
commas(rawMPLBases),\
commas(mplCount[0]),\
float(mplCount[0] / float(mplCount[1]))])
totalTable
print 'Total bases:', commas(shortCount[0] + longCount[0] + mplCount[0])
print 'Estimated genome size: 3.6Gb'
print 'Raw coverage:', (shortCount[0] + longCount[0] + mplCount[0]) / 3.6e9
The following figures show k-mer spectra for short-sgl, long-sgl, the MPLs, and all data together. "Good" k-mers are defined here as k-mers above the first coverage minimum; k-mers below the first coverage minimum are most likely sequencing errors, which will probably be discarded by the assembler.
The intersection of the unique good k-mers of the three datasets shows high overlap, meaning the sequence information content is consistent among the three library types. However, this does not rule out coverage bias among the three library types.
shortData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) \
for line in open(prefix + 'spectra/shortsgl.30.histo')]
longData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) \
for line in open(prefix + 'spectra/longsgl.30.histo')]
mplData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) \
for line in open(prefix + 'spectra/mpl.30.histo')]
bbnormData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) \
for line in open(prefix + 'spectra/bbnorm.30.histo')]
allData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) \
for line in open(prefix + 'spectra/all.30.histo')]
pd.Series([x[1] for x in shortData], index = [x[0] for x in shortData])[:150].plot(kind='area', alpha=0.3,\
figsize=[15,6], ylim=[0,2e8])
pd.Series([x[1] for x in longData], index = [x[0] for x in longData])[:150].plot(kind='area', alpha=0.3,\
figsize=[15,6], ylim=[0,2e8])
pd.Series([x[1] for x in mplData], index = [x[0] for x in mplData])[:150].plot(kind='area', alpha=0.3,\
figsize=[15,6], ylim=[0,2e8])
pd.Series([x[1] for x in allData], index = [x[0] for x in allData])[:150].plot(kind='area', alpha=0.3,\
figsize=[15,6], ylim=[0,2e8])
plt.legend(['shortSGL','longSGL','MPL','All data'])
pd.Series([x[1] for x in allData], index = [x[0] for x in allData])[:200].plot(kind='area', alpha=0.3,\
figsize=[15,6], ylim=[0,0.35e8])
def getMetric(path):
histoData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) for line in open(path)]
s = pd.Series([x[1] for x in histoData], index = [x[0] for x in histoData])
lowCov = 0
avgCov = 0
for i, v in enumerate(histoData):
if histoData[i + 1][1] > histoData[i][1]:
lowCov = histoData[i][0]
break
for i, v in enumerate(histoData[lowCov:]):
if histoData[lowCov + i + 1][1] < histoData[lowCov + i][1]:
avgCov = histoData[lowCov + i][0]
break
k = s * s.index
    #(mean read length - k + 1) / mean read length normalizes k-mer coverage to base coverage
    #assume a mean read length of 145:
    #(145 - 30 + 1) / 145 = 0.8
return {'mincov':lowCov,\
'peakcov':avgCov,\
'uniqk':commas(int(s.sum())),\
'allk':commas(int(k.sum())),\
'erroruk':commas(int(s[:lowCov].sum())),\
'realuk':commas(int(s[lowCov:].sum())),\
'errork':commas(int(k[:lowCov].sum())),\
'realk':commas(int(k[lowCov:].sum())),\
'error':float(sum(k[:lowCov])) / sum(k) / 30,\
'size':int(k[lowCov:].sum() / avgCov)}
shortData = getMetric(prefix + 'spectra/shortsgl.30.histo')
longData = getMetric(prefix + 'spectra/longsgl.30.histo')
mplData = getMetric(prefix + 'spectra/mpl.30.histo')
allData = getMetric(prefix + 'spectra/all.30.histo')
#the automatic peak detection is unreliable for the combined spectrum; set the
#peak manually at the ~57x mode visible in the plot above
allData['peakcov'] = 57
allData['size'] = int(allData['realk'].replace(',','')) / 57
bbData = getMetric(prefix + 'spectra/bbnorm.30.histo')
metricTable = ListTable()
metricTable.append(['Library','error coverage','peak coverage','all k-mers',\
'unique k-mers','error k-mers','good k-mers','error unique k-mers',\
'good unique k-mers'])
names =['shortSGL','longSGL','MPL','all','bbNorm']
for i,data in enumerate([(shortData,(147 - 30 + 1) / 147.0), \
(longData,(147 - 30 + 1) / 147.0), \
(mplData,(114 - 30 + 1) / 114.0), \
(allData,(130 - 30 + 1) / 130.0)]):
d = data[0]
s = commas(int(data[1] * d['size']))
metricTable.append([names[i],\
d['mincov'],\
d['peakcov'],\
d['allk'],\
d['uniqk'],\
d['errork'],\
d['realk'],\
d['erroruk'],\
d['realuk']])
metricTable
histoData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) for line in open(prefix + 'spectra/all.30.histo')]
s = pd.Series([x[1] for x in histoData], index = [x[0] for x in histoData])
k = s * s.index
#if the ~57x component is heterozygous sequence, each heterozygous locus
#contributes two k-mer variants, so that portion is counted at half weight;
#the ~114x component is homozygous
print 'estimated genome size not taking into account heterozygosity:',\
commas(int(k[15:101].sum() / 57 + k[100:].sum() / 114))
print 'estimated genome size taking into account heterozygosity:', \
commas(int(k[15:101].sum() / 57 / 2 + k[100:].sum() / 114))
from matplotlib_venn import venn3, venn2
'''
A = shortsgl
B = longsgl
C = mpl
Abc = 9142176
aBc = 35819919
ABc = 59846436
abC = 43811369
AbC = 88180851
aBC = 8740025
ABC = 2098929438
'''
plt.figure(figsize=(6,6))
venn3(subsets = (9142176, 35819919, 59846436, 43811369, 88180851, 8740025, 2098929438), \
set_labels = ('Short-SGL', 'Long-SGL', 'MPL'))
setTable = ListTable()
setTable.append(['library','good unique k-mers','percent in intersection'])
setTable.append(['Short-SGL',2256098901,100 * float(2098929438) / 2256098901])
setTable.append(['Long-SGL',2203335818,100 * float(2098929438) / 2203335818])
setTable.append(['MPL',2239661683,100 * float(2098929438) / 2239661683])
setTable
Contig assembly was performed with ABySS on Amazon cloud instances. StarCluster was used to automatically set up a cluster environment (Sun Grid Engine).
The following are instructions for setting up a cluster of 21 nodes (a small head node plus 20 high-CPU nodes) and running ABySS.
Install StarCluster by:
sudo pip install starcluster
The following config (~/.starcluster/config) was used to start 20 c3.8xlarge instances (32 CPUs, 60GB RAM each). The master node is an m3.xlarge.
[global]
DEFAULT_TEMPLATE=abyss
[aws info]
AWS_ACCESS_KEY_ID = KEY
AWS_SECRET_ACCESS_KEY = KEY
AWS_USER_ID= ID
AWS_REGION_NAME = eu-west-1
AWS_REGION_HOST = ec2.eu-west-1.amazonaws.com
[key dk]
KEY_LOCATION=~/.ssh/dk.pem
[cluster abyss]
KEYNAME = dk
CLUSTER_SIZE = 21
CLUSTER_USER = sgeadmin
CLUSTER_SHELL = bash
NODE_IMAGE_ID = ami-ca4abfbd
NODE_INSTANCE_TYPE = c3.8xlarge
MASTER_INSTANCE_TYPE = m3.xlarge
MASTER_IMAGE_ID = ami-ca4abfbd
AVAILABILITY_ZONE = eu-west-1a
VOLUMES = data
SPOT_BID = 3.0
[volume data]
VOLUME_ID = vol-f0daa6f7
MOUNT_PATH = /working
Basic StarCluster commands:
To start the cluster:
starcluster start abyss
To ssh into cluster as root:
starcluster sshmaster abyss
To terminate cluster:
starcluster terminate abyss
To install abyss:
sudo apt-get install -y sparsehash
wget http://sourceforge.net/projects/boost/files/boost/1.56.0/boost_1_56_0.zip/download
mv download boost.zip
unzip boost.zip
git clone https://github.com/bcgsc/abyss.git
cd abyss
./autogen.sh
./configure --enable-maxk=128 --prefix=INSTALL_PATH --with-boost=/root/boost_1_56_0/ --with-mpi=/usr/lib/openmpi
make
make install
Add abyss to the PATH variable:
echo 'PATH=$PATH:/working/software/bin' >> ~/.bashrc
source ~/.bashrc
To get the abyss-pe run commands, use the --dry-run parameter:
/working/software/bin/abyss-pe k=70 np=640 j=32 q=30 lib='shortsgl longsgl' shortsgl='/working/reads/AP_phaw_S4_shortsgl_*' longsgl='/working/reads/AP_phaw_S4_longsgl_*' name='pilot' --dry-run > abyss.commands
k specifies the k-mer size.
np specifies the number of processes (number of nodes × number of CPUs per node).
j specifies the number of CPUs per node.
q specifies the quality-trimming threshold.
Take the mpirun line from the dry run and put it in an abyss.mpi.sh file. Make it executable:
chmod a+x abyss.mpi.sh
Qsub the command:
qsub -V -cwd -pe orte 640 ./abyss.mpi.sh
Two assemblies with different k-mer sizes (k120 and k70) were performed with all the reads.
In addition, the two assemblies were merged with GAM-NGS. GAM-NGS designates a master and a slave assembly; the slave assembly is used to merge the master assembly's contigs. All master assembly contigs are output, while slave contigs not found in the master assembly are neither used nor output. Here the k120 assembly was the master and the k70 assembly the slave (see the gam-create/gam-merge commands below).
The GAM-NGS merging was done by first mapping short-sgl reads as PE to each contig assembly:
bowtie2 -X 600 --no-unal --no-mixed --no-discordant --no-dovetail --no-contain --no-overlap -p 32 -x indexFile -1 pairA -2 pairB | samtools view -Shb - > output.bam
parallel samtools sort -m 5G -O bam -T {.} -@ 32 {} \> {.}.sorted.bam ::: *.bam
parallel samtools index {} ::: *.sorted.bam
An info file is created for each contig assembly, listing where the sorted .bam files are and their estimated insert sizes:
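The info files themselves were not preserved here. As far as the GAM-NGS documentation goes, each line of such a list file gives a sorted BAM path and its expected insert-size range; a sketch of what k120.info may have looked like (the path and the insert bounds are assumptions):
/phaw/gam/bam/k120/shortsgl.k120.sorted.bam 250 650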
Then gam-create and gam-merge are used with the info files:
gam-create --master-bam /phaw/gam/bam/k120/k120.info --slave-bam /phaw/gam/bam/k70/k70.info --min-block-size 10 --output k120_k70
gam-merge --master-bam /phaw/gam/bam/k120/k120.info --slave-bam /phaw/gam/bam/k70/k70.info --min-block-size 50 --blocks-file k120_k70.blocks --master-fasta abyss_k120.ctg.renamed.filtered.fa --slave-fasta abyss_k70.ctg.renamed.filtered.fa --threads 32 --output k120_k70.gam 2> merge.err
Jellyfish was run on the k120 and k70 assemblies. Using the k-mer count output, we can calculate the k-mer coverage across each contig sequence. If a contig is totally unique, we should see a constant k-mer coverage of 1 throughout the contig. Areas of high k-mer coverage represent regions that appear in many other places in the assembly (repetitive regions). We expect contigs with high k-mer coverage to correspond to regions of high read coverage.
The following figure plots the length of each contig against the average k-mer coverage across the contig. K-mer coverage is capped at 100, bounding the y-axis between 1 and 100. We notice coverage artifacts for contigs around 2×k in length, likely because the assembler applies filtering thresholds at 2×k.
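The covStat tables loaded below were produced outside this notebook (a getKmerCoverage.py script appears later in the scaffolding section). A minimal sketch of how per-contig k-mer coverage statistics could be derived from a jellyfish dump, assuming the dump is a fasta of count/k-mer records (the function name and defaults are illustrative):
from Bio import SeqIO
import numpy as np

def contigCovStat(kmerFastaPath, contigFastaPath, k = 30, cap = 100):
    #jellyfish dump writes one fasta record per k-mer: header = count, seq = k-mer;
    #k-mers dropped by 'dump -L 2' are singletons, so missing k-mers count as 1
    counts = {}
    for record in SeqIO.parse(open(kmerFastaPath), 'fasta'):
        counts[str(record.seq)] = min(cap, int(record.id))
    stats = {}
    for record in SeqIO.parse(open(contigFastaPath), 'fasta'):
        seq = str(record.seq)
        if len(seq) < k:
            continue
        #note: jellyfish -C stores canonical k-mers; a full implementation
        #would also look up the reverse complement of each window
        covs = [counts.get(seq[i:i + k], 1) for i in range(len(seq) - k + 1)]
        stats[record.id] = (min(covs), max(covs), np.mean(covs), np.std(covs), len(seq))
    return stats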
from Bio import SeqIO
from collections import defaultdict
import math
def nX(lengths):
    #compute Nx contig lengths from a descending-sorted list of lengths:
    #nx[x] holds the contig length at which x percent of the total bases is reached
    bpCount = 0
    totalCount = sum(lengths)
    nx = {}
    for l in lengths:
        bpCount += l
        xPercent = math.floor(float(bpCount) / totalCount * 100)
        if not nx.has_key(xPercent):
            nx[xPercent] = l
    nx = nx.items()
    nx.sort(key = lambda x : x[0])
    return nx
abyss_k120_stat = pd.read_csv(open(prefix + 'kmer/abyss_k120.ctg.renamed.covStat'),sep='\t',index_col=0)
abyss_k120_stat.columns = ['min','max','avg','std','length']
import numpy as np
abyss_k120_sample = abyss_k120_stat.loc[np.random.choice(abyss_k120_stat.index, 100000, replace=False)]
print 'Bounded 1-100 abyss k120 assembly (subsampled 100000) length by k-mer coverage plot'
fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize=[18,5])
abyss_k120_sample[abyss_k120_sample['length'] < 1000].plot(ax=axes[0],kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k120_sample['avg'].hist(bins = 100, ax = axes[1])
axes[0].set_title('length vs average coverage')
axes[1].set_title('histogram of avg coverage')
An average k-mer coverage alone is ambiguous: a coverage of ~50 averaged across a 100bp contig could represent a constant coverage of 50 along the entire contig, or a coverage of 1 across one half and 100 across the other.
We can calculate the potential maximum number of bases with "good" coverage (defined here as 3 or less) by:
max low coverage (MLC) length = length - (length × avg coverage / 97)
Contigs with a short MLC length can be filtered out because they are mostly repetitive. Most of the raw reads are ~140bp long, so we can filter out any contig with an MLC length of less than 140.
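For example, a 1,000bp contig with an average capped coverage of 50 has an MLC length of 1000 - (1000 × 50 / 97) ≈ 485bp of potentially unique sequence and passes the filter, while the same contig at an average coverage of 90 has an MLC length of ≈ 72bp and is discarded.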
abyss_k120_sample_mlc = abyss_k120_sample['length'] - (abyss_k120_sample['length'] * abyss_k120_sample['avg'] / 97)
abyss_k120_sample_lengthFilter = abyss_k120_sample[abyss_k120_sample['length'] > 239]
abyss_k120_sample_length_mclFilter = abyss_k120_sample\
[~abyss_k120_sample.index.isin(abyss_k120_sample_mlc[abyss_k120_sample_mlc < 140].index)]\
[abyss_k120_sample['length'] > 239]
fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize=[18,5])
ax = abyss_k120_sample\
[~abyss_k120_sample.index.isin(abyss_k120_sample_length_mclFilter.index)]\
[abyss_k120_sample['length'] < 1000]\
.plot(ax = axes[1], color=sns.color_palette()[0], kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k120_sample_lengthFilter\
[abyss_k120_sample_lengthFilter['length'] < 1000]\
.plot(color=sns.color_palette()[1],ax = ax, kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k120_sample_length_mclFilter\
[abyss_k120_sample_length_mclFilter['length'] < 1000]\
.plot(color=sns.color_palette()[2],ax = ax, kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k120_sample['avg'].hist(ax = axes[0], bins = 100).set_ylim([0,9000])
abyss_k120_sample_lengthFilter['avg'].hist(ax = axes[0], bins = 100)
abyss_k120_sample_length_mclFilter['avg'].hist(ax = axes[0], bins = 100)
axes[0].legend(['all','length > 239','length > 239 and MLC length >= 140'])
abyss_k120_mlc = abyss_k120_stat['length'] - (abyss_k120_stat['length'] * abyss_k120_stat['avg'] / 97)
abyss_k120_lengths = abyss_k120_stat\
[~abyss_k120_stat.index.isin(abyss_k120_mlc[abyss_k120_mlc < 140].index)]\
[abyss_k120_stat['length'] > 239]['length'].copy()
abyss_k120_lengths.sort(ascending = False)
abyss_k120_nx = nX(abyss_k120_lengths)
print 'Total number of contigs:', commas(len(abyss_k120_stat))
print 'Total bases:', commas(int(sum(abyss_k120_stat['length'])))
print
print '# contigs after filtering length > 239:', \
commas(len(abyss_k120_stat[abyss_k120_stat['length'] > 239]))
print '# bases after filtering length > 239:', \
commas(int(sum((abyss_k120_stat[abyss_k120_stat['length'] > 239]['length']))))
print '# contigs after filtering MLC length >= 140:', \
commas(len(abyss_k120_stat\
[~abyss_k120_stat.index.isin(abyss_k120_mlc[abyss_k120_mlc < 140].index)]\
[abyss_k120_stat['length'] > 239]))
print '# bases after filtering MLC length >= 140:', \
commas(int(sum(abyss_k120_stat\
[~abyss_k120_stat.index.isin(abyss_k120_mlc[abyss_k120_mlc < 140].index)]\
[abyss_k120_stat['length'] > 239]['length'])))
print
for i in range(10):
print 'N' + str(i * 10) + ':', commas(int(abyss_k120_nx[i * 10][1]))
abyss_k70_stat = pd.read_csv(open(prefix + 'kmer/abyss_k70.ctg.renamed.covStat'),sep='\t',index_col=0)
abyss_k70_stat.columns = ['min','max','avg','std','length']
import numpy as np
abyss_k70_sample = abyss_k70_stat.loc[np.random.choice(abyss_k70_stat.index, 100000, replace=False)]
print 'Bounded 1-100 abyss k70 assembly (subsampled 100000) length by k-mer coverage plot'
fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize=[18,5])
abyss_k70_sample[abyss_k70_sample['length'] < 1000].plot(ax=axes[0],kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k70_sample['avg'].hist(bins = 100, ax = axes[1])
axes[0].set_title('length vs average coverage')
axes[1].set_title('histogram of avg coverage')
abyss_k70_sample_mlc = abyss_k70_sample['length'] - (abyss_k70_sample['length'] * abyss_k70_sample['avg'] / 97)
abyss_k70_sample_lengthFilter = abyss_k70_sample[abyss_k70_sample['length'] > 139]
abyss_k70_sample_length_mclFilter = abyss_k70_sample\
[~abyss_k70_sample.index.isin(abyss_k70_sample_mlc[abyss_k70_sample_mlc < 140].index)]\
[abyss_k70_sample['length'] > 139]
fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize=[18,5])
ax = abyss_k70_sample\
[~abyss_k70_sample.index.isin(abyss_k70_sample_length_mclFilter.index)]\
[abyss_k70_sample['length'] < 1000]\
.plot(ax = axes[1], color=sns.color_palette()[0], kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k70_sample_lengthFilter\
[abyss_k70_sample_lengthFilter['length'] < 1000]\
.plot(color=sns.color_palette()[1],ax = ax, kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k70_sample_length_mclFilter\
[abyss_k70_sample_length_mclFilter['length'] < 1000]\
.plot(color=sns.color_palette()[2],ax = ax, kind='scatter', x='length',y='avg',alpha=0.1, s = 2)
abyss_k70_sample['avg'].hist(ax = axes[0], bins = 100).set_ylim([0,9000])
abyss_k70_sample_lengthFilter['avg'].hist(ax = axes[0], bins = 100)
abyss_k70_sample_length_mclFilter['avg'].hist(ax = axes[0], bins = 100)
axes[0].legend(['all','length > 139','length > 139 and MLC length >= 140'])
abyss_k70_mlc = abyss_k70_stat['length'] - (abyss_k70_stat['length'] * abyss_k70_stat['avg'] / 97)
abyss_k70_lengths = abyss_k70_stat\
[~abyss_k70_stat.index.isin(abyss_k70_mlc[abyss_k70_mlc < 140].index)]\
[abyss_k70_stat['length'] > 139]['length'].copy()
abyss_k70_lengths.sort(ascending = False)
abyss_k70_nx = nX(abyss_k70_lengths)
print 'Total number of contigs:', commas(len(abyss_k70_stat))
print 'Total bases:', commas(int(sum(abyss_k70_stat['length'])))
print
print '# contigs after filtering length > 139:', \
commas(len(abyss_k70_stat[abyss_k70_stat['length'] > 139]))
print '# bases after filtering length > 139:', \
commas(int(sum((abyss_k70_stat[abyss_k70_stat['length'] > 139]['length']))))
print '# contigs after filtering MLC length >= 140:', \
commas(len(abyss_k70_stat\
[~abyss_k70_stat.index.isin(abyss_k70_mlc[abyss_k70_mlc < 140].index)]\
[abyss_k70_stat['length'] > 139]))
print '# bases after filtering MLC length >= 140:', \
commas(int(sum(abyss_k70_stat\
[~abyss_k70_stat.index.isin(abyss_k70_mlc[abyss_k70_mlc < 140].index)]\
[abyss_k70_stat['length'] > 139]['length'])))
print
for i in range(10):
print 'N' + str(i * 10) + ':', commas(int(abyss_k70_nx[i * 10][1]))
Summary of the k120, k70, and merged k120 + k70 assemblies:
fs = [x for x in os.listdir(prefix + 'contig') if x.split('.')[-1] == 'length']
lengths = {}
nxs = {}
for f in fs:
libLength = [int(x.split()[1]) for x in open(prefix + 'contig/' + f).read().strip().split('\n')]
libLength.sort(reverse = True)
libName = ' '.join(f.split('.')[0].split('_'))
lengths[libName] = libLength
nxs[libName] = nX(libLength)
libs = lengths.keys()
indLibs = ['abyss k120','abyss k70','k120 k70']
summaryTable = ListTable()
summaryTable.append([''] + indLibs)
summaryTable.append(['#contigs'] + [commas(len(lengths[lib])) for lib in indLibs])
summaryTable.append(['#max length'] + [str(lengths[lib][0]) for lib in indLibs])
summaryTable.append(['#sum bases'] + [commas(sum(lengths[lib])) for lib in indLibs])
for i in range(10):
summaryTable.append(['N' + str(i * 10)] + [str(nxs[lib][i * 10][1]) for lib in indLibs])
summaryTable
The abyss k120 contig assembly was used as the reference for mapping paired-end reads to estimate fragment size.
The terms "insert size" and "fragment size" seem to be used interchangeably in most literature. Insert size should mean the distance between the two reads; fragment size should mean the length of the entire fragment (insert size plus the lengths of the reads on both ends). In this analysis we estimate the fragment size.
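Schematically, restating the definitions above:
|--read A-->......insert......<--read B--|
|---------------fragment----------------|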
The Short-SGL/Long-SGL libraries' paired reads are in forward/reverse orientation, while the MPL libraries' paired reads are in reverse/forward orientation. Scaffolding was done with SSPACE after estimating fragment sizes.
The bowtie2 command used to map reads as single-end was:
bowtie2 --very-sensitive -p 32 --no-hd --no-sq --no-unal -x db -U lib.gz > lib.sam
#parallel echo commands
parallel echo "bowtie2 --very-sensitive -p 32 --no-hd --no-sq --no-unal -x prelim -U {} \> {/.}.sam" :::: fs
A python script, process.py, was used to loop through the resulting .sam files and collect insert-size data.
The following figure shows a histogram of fragment sizes for each library. The MPLs are separated into two types: "with adapters" are pairs where at least one read contained a junction adapter sequence (categories A + B + C in nextclip), and "no adapters" are pairs where no junction adapter was found in either read (category D in nextclip). Nextclip recommends discarding pairs with no junction adapter because they could be random sub-fragments of the circularized fragment.
def fragmentSize(path):
rf = []
fr = []
ff = []
rr = []
inFile = open(path)
count = 0
for line in inFile:
meta = line.strip().split()
aOrient = meta[1]
bOrient = meta[6]
aStart = int(meta[2])
aLen = int(meta[3])
bStart = int(meta[7])
bLen = int(meta[8])
if aLen >= 50 and bLen >= 50: #only look at reads that have >= 50bp mapping
if aOrient == '+':
if bOrient == '-':
#if read A is + and read B is -, then coordinate of read B needs to be < coordinate of read A
if bStart < aStart:
rf.append(aStart + aLen - bStart)
else:
fr.append(bStart + bLen - aStart)
                else:
                    #both reads forward: span from the leftmost start to the rightmost end
                    if bStart < aStart:
                        ff.append(aStart + aLen - bStart)
                    else:
                        ff.append(bStart + bLen - aStart)
            else:
                if bOrient == '+':
                    #mirror of the +/- case above with the roles of A and B swapped:
                    #read A (-) right of read B (+) is forward/reverse, otherwise reverse/forward
                    if aStart > bStart:
                        fr.append(aStart + aLen - bStart)
                    else:
                        rf.append(bStart + bLen - aStart)
                else:
                    #both reads reverse: span from the leftmost start to the rightmost end
                    if bStart < aStart:
                        rr.append(aStart + aLen - bStart)
                    else:
                        rr.append(bStart + bLen - aStart)
count += 1
if count == 500000:
break
inFile.close()
return (fr, rf, ff, rr)
fs = [x for x in os.listdir(prefix + 'fragsize') if x.split('.')[-1] == 'data']
dists = {}
for f in fs:
name = f.split('.')[0]
orient = 0
if len(name.split('_')) == 2:
orient = 1
if name.split('_')[1] == 'D':
name = name.split('_')[0] + ', no adpt.'
else:
name = name.split('_')[0] + ', with adpt.'
dists[name] = pd.Series(fragmentSize(prefix + 'fragsize/' + f)[orient])
ks = dists.keys()
ks.sort()
fig, axes = plt.subplots(ncols = 2, nrows = 5, figsize=[15,15])
for i, d in enumerate(ks):
col = i % 2
row = i / 2
pd.Series(dists[d]).hist(bins=150, ax = axes[row, col])
axes[row,col].set_title(ks[i])
if d == 'shortsgl' or d == 'longsgl':
axes[row,col].set_xlim(0,1000)
else:
axes[row,col].set_xlim(0,20000)
fig.tight_layout()
The longSGL library shows a wide spread of fragment sizes, so we will not use it for scaffolding; we will still use it for contigging. We would expect the MPL pairs without adapters to contain more erroneous fragment sizes, but that does not seem to be the case: the no-adapter and with-adapter fragment-size distributions are very similar, with only slightly more small fragments among the no-adapter pairs. For scaffolding, we will therefore include the reads with no adapters, against the recommendation of Nextclip.
The peaks were manually bounded to estimate the mean and standard deviation of the fragment size for each library.
shortsgl = dists['shortsgl']
kb6mpl = dists['6kbmpl, with adpt.']
kb8mpl = dists['8kbmpl, with adpt.']
kb9mpl = dists['9kbmpl, with adpt.']
kb20mpl = dists['20kbmpl, with adpt.']
fig, axes = plt.subplots(ncols = 2, nrows = 3, figsize=[15,10])
shortsgl[shortsgl < 700][shortsgl > 150].hist(bins=50, ax = axes[0,0]).set_title('shortsgl')
kb6mpl[kb6mpl < 7000][kb6mpl > 4300].hist(bins=100, ax = axes[0,1]).set_title('6kbmpl')
kb8mpl[kb8mpl < 9000][kb8mpl > 6000].hist(bins=100, ax = axes[1,0]).set_title('8kbmpl')
kb9mpl[kb9mpl < 11500][kb9mpl > 7500].hist(bins=100, ax = axes[1,1]).set_title('9kbmpl')
kb20mpl[kb20mpl < 16500][kb20mpl > 11500].hist(bins=100, ax = axes[2,0]).set_title('20kbmpl')
fragTable = ListTable()
fragTable.append(['Library','Mean fragment size','Standard Dev. fragment size'])
fragTable.append(['Short-SGL',\
str(shortsgl[shortsgl < 700][shortsgl > 150].mean()),\
str(shortsgl[shortsgl < 700][shortsgl > 150].std())])
fragTable.append(['6kb-MPL',\
str(kb6mpl[kb6mpl < 7000][kb6mpl > 4300].mean()),\
str(kb6mpl[kb6mpl < 7000][kb6mpl > 4300].std())])
fragTable.append(['8kb-MPL',\
str(kb8mpl[kb8mpl < 9000][kb8mpl > 6000].mean()),\
str(kb8mpl[kb8mpl < 9000][kb8mpl > 6000].std())])
fragTable.append(['9kb-MPL',\
str(kb9mpl[kb9mpl < 11500][kb9mpl > 7500].mean()),\
str(kb9mpl[kb9mpl < 11500][kb9mpl > 7500].std())])
fragTable.append(['20kb-MPL',\
str(kb20mpl[kb20mpl < 16500][kb20mpl > 11500].mean()),\
str(kb20mpl[kb20mpl < 16500][kb20mpl > 11500].std())])
fragTable
The K-mer spectra of all reads shows a strong peak at ~57 multiplicity and a large shoulder extending to double that coverage. This indicates that a portion of the k-mers that will be used in the contig assembly is covered at twice the coverage of another portion.
By mapping all the reads back to the assembled/merged contigs, we see a distinct bimodal distribution in which one portion of the contigs (~100x coverage) is covered at twice the coverage of another portion (~50x coverage).
inFile = open('/home/share/projects/phaw_genome/draft_3.0/assembly/segmentDuplication/k120_k70.gam.format.data')
df_data = {'cov':[],'aln':[],'ident':[],'length':[]}
df_index = []
for line in inFile:
meta = line.strip().split()
cov = float(meta[2])
#was calculated as the number of matches / alignment length (bigger of the query/hit, includes gaps)
ident = float(meta[-2])
aln = min(1.0,float(meta[3]) / int(meta[1]))
df_index.append(meta[0])
df_data['cov'].append(cov)
df_data['aln'].append(aln)
df_data['ident'].append(ident)
df_data['length'].append(int(meta[1]))
df_data = pd.DataFrame(df_data)
df_data.index = df_index
leg = []
for i in [0,500,1000,5000]:
leg.append('length > ' + str(i))
df_data[df_data['cov'] < 200][df_data['length'] > i]['cov'].hist(bins=100, figsize=[10,6])
plt.legend(leg,prop={'size':13})
Here is the k-mer spectrum for all the data:
pd.Series([x[1] for x in allData], index = [x[0] for x in allData])[:200].plot(kind='area', alpha=0.3,\
figsize=[10,6], ylim=[0,0.35e8])
histoData = [(int(line.strip().split()[0]), int(line.strip().split()[1])) for line in open('/home/share/projects/phaw_genome/draft_2.3/spectra/all.30.histo')]
s = pd.Series([x[1] for x in histoData], index = [x[0] for x in histoData])
k = s * s.index
print 'All K-mers:', commas(int(sum(k)))
print 'All unique K-mers:', commas(int(sum(s)))
print
print 'K-mers below 15 multiplicity (sequencing errors):', commas(int(sum(k[:15])))
print 'Unique K-mers below 15 multiplicity (sequencing errors):', commas(int(sum(s[:15])))
print
print 'K-mers above 180 multiplicity (repetitive regions):', commas(int(sum(k[180:])))
print 'Unique k-mers above 180 multiplicity (repetitive regions):', commas(int(sum(s[180:])))
print
print 'K-mers between 15-180 multiplicity:', commas(int(sum(k[15:180])))
print 'Unique K-mers between 15-180 multiplicity:', commas(int(sum(s[15:180])))
print
print 'Percentage of all k-mers between 15-180 multiplicity:', sum(k[15:180]) / float(sum(k)) * 100
print 'Average k-mer coverage between 15-180 multiplicity:',sum(k[15:180]) / float(sum(s[15:180]))
This bimodal distribution could possibly indicate high heterozygosity (contigs at ~50 coverage are heterozygous alleles) or segmental duplication (contigs at ~100 coverage are recently duplicated loci in the genome).
LAST was used to align all the contigs against each other. An alignment percentage and an identity were calculated for each contig: the alignment percentage is the aligned length divided by the contig length (capped at 1), and the identity is the number of matches divided by the alignment length (the larger of query/hit, including gaps), as in the parsing code above.
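The exact LAST invocation was not recorded in this notebook; an all-vs-all run along these lines produces the alignments (lastdb and lastal are the standard LAST tools, but the database name and use of defaults here are assumptions):
lastdb phaw_ctg k120_k70.gam.format.fa
lastal phaw_ctg k120_k70.gam.format.fa > k120_k70.self.maf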
The following graph shows the proportion of contigs at various alignment percentages:
df_data[df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['cov'] < 200][df_data['aln'] >= 0.7]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['cov'] < 200][df_data['aln'] >= 0.8]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['cov'] < 200][df_data['aln'] >= 0.9]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['cov'] < 200][df_data['aln'] >= 0.95]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['cov'] < 200][df_data['aln'] == 1]['cov'].hist(bins=150, figsize=[10,6])
plt.legend(['all contigs',\
'at least 70% alignment to another contig',\
'at least 80% alignment to another contig',\
'at least 90% alignment to another contig',\
'at least 95% alignment to another contig',\
'100% alignment to another contig'],prop={'size':14})
df_data['aln'][df_data['aln'] > 0.5].hist(bins=50,figsize=[10,5])
There is a peak at roughly >= 95% alignment.
The following is the distribution of identities for contigs with >= 95% alignment. There is a sharp drop-off at ~97% identity; contig pairs above ~97% identity and >= 95% alignment were probably already collapsed into a single contig during the assembly process.
df_data[df_data['aln'] >= 0.95]['ident'].hist(bins= 100,figsize=[10,6])
Here are the proportions of contigs that have >= 95% alignment at various identities.
df_data[df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['ident'] >= 0.7][df_data['aln'] >= 0.95][df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['ident'] >= 0.8][df_data['aln'] >= 0.95][df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['ident'] >= 0.9][df_data['aln'] >= 0.95][df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_data[df_data['ident'] >= 0.95][df_data['aln'] >= 0.95][df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
plt.legend(['all contigs',\
'at least 70% identity',\
'at least 80% identity',\
'at least 90% identity',\
'at least 95% identity',],prop={'size':14})
print 'Number of contigs total:', commas(len(df_data['length']))
print 'Sum bases of contigs:', commas(int(sum(df_data['length'])))
print
print 'Number of contigs between 10-110 (lower covered contigs):', \
commas(len(df_data[df_data['cov']>=10][df_data['cov']<=110]['length']))
print 'Sum bases of contigs between 10-110 (lower covered contigs):', \
commas(int(sum(df_data[df_data['cov']>=10][df_data['cov']<=110]['length'])))
print
print 'Number of contigs >= 95% alignment:', commas(len(df_data[df_data['aln'] >= 0.95]['length']))
print 'Sum bases of contigs >= 95% alignment:', commas(int(sum(df_data[df_data['aln'] >= 0.95]['length'])))
print 'Percentage of lower covered contigs:', \
100 * float(sum(df_data[df_data['aln'] >= 0.95]['length'])) / sum(df_data[df_data['cov']>=10][df_data['cov']<=110]['length'])
print
print 'Number of contigs >= 95% alignment and >= 97% identity:', \
commas(len(df_data[df_data['aln'] >= 0.95][df_data['ident'] >= 0.97]))
print 'Sum bases of contigs >= 95% alignment and >= 97% identity:', \
commas(int(sum(df_data[df_data['aln'] >= 0.95][df_data['ident'] >= 0.97]['length'])))
print 'Percentage of lower covered contigs:', \
100 * float(sum(df_data[df_data['aln'] >= 0.95]\
[df_data['ident'] >= 0.97]['length'])) / sum(df_data[df_data['cov']>=10][df_data['cov']<=110]['length'])
CD-HIT was used to collapse contigs that are at least 96% identical to another contig.
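The exact command was not recorded; a cd-hit-est invocation along these lines matches the 96% threshold (the word size, memory, and thread settings are assumptions):
cd-hit-est -i k120_k70.gam.format.fa -o k120_k70.gam.format.cdhit.fa -c 0.96 -n 10 -M 0 -T 32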
cdhitFile = open('/home/share/projects/phaw_genome/draft_3.0/assembly/contig/k120_k70.gam.format.cdhit.fa.clstr')
blocks = cdhitFile.read().strip().split('>Cluster')[1:]
ctgLengths = {}
for line in open('/home/share/projects/phaw_genome/draft_3.0/assembly/segmentDuplication/k120_k70.gam.format.lengths'):
meta = line.strip().split()
ctgLengths[meta[0]] = int(meta[1])
cdhit_clusters = {}
cdhit_members = []
for block in blocks:
lines = block.strip().split('\n')
primary = 0
members = []
for line in lines[1:]:
cid = line.strip().split('>')[-1].split('...')[0]
status = line.strip().split('...')[-1].strip()
if status == '*':
primary = cid
else:
members.append(cid)
cdhit_clusters[primary] = members
cdhit_members.extend(members)
print 'Number of contigs total:', commas(len(ctgLengths))
print 'Sum bases of contigs:', commas(sum(ctgLengths.values()))
print
print 'Number of contigs after CD-HIT:', commas(len(cdhit_clusters))
print 'Sum bases of contig after CD-HIT:', commas(sum([ctgLengths[k] for k in cdhit_clusters.keys()]))
print
print 'Number of contigs collapsed:', commas(sum([len(v) for v in cdhit_clusters.values()]))
print 'Sum bases of contigs collapsed:', commas(sum([ctgLengths[x] for x in cdhit_members]))
cdhit_member_lengths = pd.Series([ctgLengths[x] for x in cdhit_members])
print 'Mean length of cdhit collapsed contigs:', cdhit_member_lengths.mean()
print 'Median length of cdhit collapsed contigs:', cdhit_member_lengths.median()
df_cdhit = df_data.loc[cdhit_members]
df_data[df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_cdhit[df_cdhit['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
plt.legend(['All contigs','Collapsed contigs'],prop={'size':13})
Primary sequences that were kept for each CD-HIT cluster:
df_primary = df_data.loc[[k for k,v in cdhit_clusters.items() if len(v) > 0]]
df_data[df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_primary[df_primary['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_cdhit = df_data.loc[cdhit_members]
df_data[df_data['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_cdhit[df_cdhit['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
df_primary[df_primary['cov'] < 200]['cov'].hist(bins=150, figsize=[10,6])
plt.legend(['All contigs','Collapsed contigs','primary contigs'],prop={'size':13})
df_filtered = df_data.loc[list(set(df_data.index) - set(cdhit_members))]
leg = []
for i in [0,500,1000,5000]:
leg.append('length > ' + str(i))
df_filtered[df_filtered['cov'] < 200][df_filtered['length'] > i]['cov'].hist(bins=150, figsize=[10,6])
plt.legend(leg,prop={'size':13})
The scaffolding process starts by remapping reads back to the contig assembly as single-end reads using bowtie2:
bowtie2 --very-sensitive -p 32 --no-hd --no-sq --no-unal -x database -U lib.fastq.gz > lib.sam
Links are extracted from the resulting .sam files:
python extractLinks.py a.sam b.sam output
This script outputs two files: output.sameRef.data and output.diffRef.data. Fragment size can be estimated from the output.sameRef.data file; the output.diffRef.data file is used for scaffolding.
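extractLinks.py itself is not included here. A minimal sketch of the idea, assuming headerless single-end SAM input (the real script's output columns differ; this only illustrates the same-ref/diff-ref separation):
def loadHits(path):
    #keep one (contig, position, strand) mapping per read name
    hits = {}
    for line in open(path):
        meta = line.strip().split('\t')
        hits[meta[0]] = (meta[2], int(meta[3]), '-' if int(meta[1]) & 16 else '+')
    return hits

def extractLinks(pathA, pathB, outPrefix):
    hitsA = loadHits(pathA)
    hitsB = loadHits(pathB)
    sameRef = open(outPrefix + '.sameRef.data', 'w')
    diffRef = open(outPrefix + '.diffRef.data', 'w')
    for name, a in hitsA.items():
        if name in hitsB:
            b = hitsB[name]
            out = sameRef if a[0] == b[0] else diffRef
            out.write('\t'.join(map(str, a + b)) + '\n')
    sameRef.close()
    diffRef.close()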
A masking step is then performed to filter out reads that mapped to regions of high k-mer coverage. Even though only uniquely mapped reads are used, bowtie2 allows mismatches, so reads can map pseudo-uniquely to alternate contigs assembled from regions of high coverage.
A better method would probably be to use Myers' A-statistic (the log-odds ratio of a contig being unique versus a collapsed repetitive region).
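The A-statistic was not actually computed in this pipeline, but for reference it is straightforward under the standard Poisson read-arrival model: with a global arrival rate of total reads over genome size, a contig of length L with n reads gets A = L·(R/G) − n·ln 2, and positive values favor the single-copy hypothesis. A minimal sketch:
import math

def aStatistic(contigLength, nReads, totalReads, genomeSize):
    #log-odds of 'unique region' vs 'two-copy collapsed repeat' under
    #Poisson read arrivals with global rate totalReads / genomeSize
    arrivalRate = float(totalReads) / genomeSize
    return contigLength * arrivalRate - nReads * math.log(2)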
This masking process starts by using jellyfish to collect 30-mers from the contig assembly. The following commands are then used to mask the output.diffRef.data file:
jellyfish count -C -m 30 -t 30 -s 1000000000 -o out.kmer.30.jf contigs.fa
jellyfish dump -L 2 -o out.kmer.30.fa out.kmer.30.jf
python getKmerCoverage.py out.kmer.30.fa contigs.fa contigs.cov
python maskData.py contigs.cov output.diffRef.data > output.diffRef.masked.data
The links are then converted to a tab-delimited format that SSPACE accepts. SSPACE was then used to scaffold the contigs with the following library config (columns: library name, the TAB keyword for pre-computed link files, the links file, the estimated fragment size, the allowed deviation as a fraction of that size, and the read orientation):
shortsgl TAB shortsgl.diffRef.masked.sspace.data 431 0.22 FR
6kbmpl TAB 6kbmpl.contig.paired.diffRef.masked.sspace.data 5544 0.1 RF
8kbmpl TAB 8kbmpl.contig.paired.diffRef.masked.sspace.data 7369 0.08 RF
9kbmpl TAB 9kbmpl.contig.paired.diffRef.masked.sspace.data 9395 0.08 RF
20kbmpl TAB 20kbmpl.contig.paired.diffRef.masked.sspace.data 13905 0.06 RF
from Bio import SeqIO
from collections import defaultdict
import math
def faStats(path):
faFile = open(path)
lengths = []
nCount = []
ungapped_lengths = []
for record in SeqIO.parse(faFile,'fasta'):
seq = str(record.seq).lower()
ns = seq.count('n')
nCount.append(ns)
lengths.append(len(seq))
ungapped_lengths.append(len(seq) - ns)
faFile.close()
print '---with Ns----'
print 'Sum bases: ' + str(commas(sum(lengths)))
print 'Max length: ' + str(commas(max(lengths)))
print 'Min length: ' + str(commas(min(lengths)))
print 'Number of Ns total: ' + str(commas(sum(nCount)))
print 'Avg number of Ns per scaffold: ' + str(sum(nCount) / float(len(nCount)))
lengths.sort(reverse = True)
scfNx = nX(lengths)
print
for i in range(10):
print 'N' + str(i * 10) + ':', commas(int(scfNx[i * 10][1]))
print
print '---without Ns----'
print 'Sum bases: ' + str(commas(sum(ungapped_lengths)))
print 'Max length: ' + str(commas(max(ungapped_lengths)))
print 'Min length: ' + str(commas(min(ungapped_lengths)))
ungapped_lengths.sort(reverse = True)
scfNx = nX(ungapped_lengths)
print
for i in range(10):
print 'N' + str(i * 10) + ':', commas(int(scfNx[i * 10][1]))
print
def plotLength(path):
    faFile = open(path)
    lengths = []
    nCount = []
    ungapped_lengths = []
    for record in SeqIO.parse(faFile,'fasta'):
        seq = str(record.seq).lower()
        ns = seq.count('n')
        nCount.append(ns)
        lengths.append(len(seq))
        ungapped_lengths.append(len(seq) - ns)
    faFile.close()
    #the original cell defined this function without a plotting step; a
    #histogram of the collected length distributions is the likely intent
    pd.Series(lengths).hist(bins = 100, figsize = [10,6])
    pd.Series(ungapped_lengths).hist(bins = 100, alpha = 0.5)
    plt.legend(['with Ns','without Ns'])
print 'Number of scaffolds: 133,035'
print 'Number of unplaced contigs: 259,343'
print "scaffolds"
faStats('/home/share/projects/phaw_genome/draft_3.0/assembly/phaw_30.scf.fa')
print "unplaced contigs (degenerate contigs)"
faStats('/home/share/projects/phaw_genome/draft_3.0/assembly/phaw_30.degen.fa')
print '15,160 Scaffolds with gene predictions'
faStats('/home/share/projects/phaw_genome/draft_3.0/assembly/phaw_30.genic.fa')
print 'removed contigs'
faStats('/home/share/projects/phaw_genome/draft_3.0/assembly/contig/k120_k70.gam.format.cdhit.collapsed.fa')
There are 1,488 proteins representing 248 genes in the CEGMA database. I blasted the transcriptome and the genome against the 1,488 proteins. For each of the 248 genes found, I plotted the coverage across the CEGMA protein against the e-value.
The transcriptome shows good coverage for 247/248 CEGMA genes. The genome shows moderate coverage for 244/248 CEGMA genes, reflecting the high level of fragmentation in the genome.
def tileCoords(coords):
    #merge overlapping intervals with an event sweep: emit a merged interval
    #whenever the count of open intervals returns to zero
    formatCoords = []
    for coord in coords:
        coord = [int(coord[0]),int(coord[1])]
        formatCoords.append(('s',min(coord)))
        formatCoords.append(('e',max(coord)))
    #stable sort by position; the reverse pre-sort keeps 's' before 'e' at ties
    formatCoords.sort(key = lambda x : x[0], reverse = True)
    formatCoords.sort(key = lambda x : x[1])
count = 0
posA = 0
out = []
for pos in formatCoords:
if count == 0:
posA = pos[1]
if pos[0] == 's':
count += 1
if pos[0] == 'e':
count -=1
if count == 0:
out.append((posA, pos[1]))
return out
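For example, overlapping hit coordinates are merged before computing the covered length:
tileCoords([(1, 10), (5, 20), (30, 40)]) #returns [(1, 20), (30, 40)]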
def cegmaStat(path):
    #parse blast tabular hits: for each (CEGMA protein, hit) pair, tile the HSP
    #coordinates to get the covered length, then keep the best-covered hit per KOG
cegmaFile = open(path)
hits = defaultdict(list)
for line in cegmaFile:
meta = line.strip().split()
k = (meta[0],meta[1])
hits[k].append([int(meta[6]),int(meta[7]),float(meta[-1]),int(meta[2])])
kogs = defaultdict(list)
for hit, data in hits.items():
tCoords = tileCoords(data)
evalue = min([x[2] for x in data])
klength = data[0][3]
hitLength = sum([x[1] - x[0] + 1 for x in tCoords])
kogs[hit[0]].append((hitLength / float(klength), evalue, hit[1]))
bestKog = defaultdict(lambda : (0,0,0))
for kog, data in kogs.items():
data.sort(key = lambda x : x[0], reverse = True)
kid = kog.split('_')[-1]
if bestKog[kid][0] < data[0][0]:
bestKog[kid] = (data[0][0], data[0][1], data[0][2])
return bestKog
transCegma = cegmaStat('/home/share/projects/phaw_genome/draft_3.0/transcriptome/cegma/phaw_AP.cegma')
df_transCegma = []
for kog, data in transCegma.items():
df_transCegma.append([kog, data[0],data[1],data[2]])
df_transCegma = pd.DataFrame(df_transCegma)
df_transCegma.columns = ['KogID','coverage','evalue','tid']
df_transCegma.plot(kind='scatter',x='coverage',y='evalue',logy=True,figsize=[10,6])
genomeCegma = cegmaStat('/home/share/projects/phaw_genome/draft_3.0/assembly/cegma/phaw_30.variantctg.cegma')
df_genomeCegma = []
for kog, data in genomeCegma.items():
df_genomeCegma.append([kog, data[0],data[1],data[2]])
df_genomeCegma = pd.DataFrame(df_genomeCegma)
df_genomeCegma.columns = ['KogID','coverage','evalue','tid']
df_genomeCegma.plot(kind='scatter',x='coverage',y='evalue',logy=True,figsize=[10,6])