src/bin/whamg.cpp
#include <string>
#include <iostream>
#include <math.h>
#include <cmath>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <unistd.h>
#include "split.h"
#include "join.hpp"
#include "cigar.hpp"
#include "stats.hpp"
#include "dataStructures.hpp"
#include "fastahack/Fasta.h"
// openMP - swing that hammer
#include <omp.h>
// bamtools and my headers
#include "api/BamMultiReader.h"
#include "readPileUp.h"
// phred scaling
#include "phredUtils.h"
using namespace std;
using namespace BamTools;
typedef map< char, map< int, map< int, breakpoint * > > > unpairedSVs;
typedef map< int, map< int, breakpoint * > > unpairedSVsChr;
typedef map< int, breakpoint * > unpairedSVsTypePos;
struct options{
std::vector<string> targetBams;
bool statsOnly ;
bool skipGeno ;
bool keepTrying ;
bool noInterSeqid ;
int MQ ;
int NM ;
int nthreads ;
int lastSeqid ;
int minPairMatch ;
string fasta ;
string graphOut ;
map<string, int> toSkip ;
map<string, int> toInclude ;
string seqid ;
vector<int> region ;
string svs ;
map<string,string> SMTAGS ;
string saT ;
}globalOpts;
//GLOBAL STRUCTS
// the forest of graphs
graph globalGraph;
// library stats (insertsize ...)
libraryStats insertDists;
// global read pair store
map<string, readPair*> globalPairStore;
// seqid->int index
map<string, int> inverse_lookup;
//int->seqid index
map<int, string> forward_lookup;
// options
static const char *optString = "c:i:u:m:r:a:g:x:f:e:d:hsz";
// omp lock
omp_lock_t lock;
// omp lock for the graph
omp_lock_t glock;
// omp lock for the pairstor
omp_lock_t pslock;
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : nothing
Function does : prints help
Function returns: NA
*/
void printVersion(void){
cerr << "Version: " << VERSION << endl;
cerr << "Contact: zev.kronenberg [at] gmail.com " << endl;
cerr << "Notes : -If you find a bug, please open a report on github!" << endl;
cerr << endl;
}
void printHelp(void){
//------------------------------- XXXXXXXXXX --------------------------------
cerr << " Basic usage: " << endl;
cerr << " whamg -f my.bam -a my.fasta \\ " << endl;
cerr << " -e M,GL000207.1 2> wham.err > wham.vcf"
<< endl;
cerr << endl;
cerr << " Required: " << endl;
//------------------------------- XXXXXXXXXX --------------------------------
cerr << " -f <STRING> Comma separated list of bam files or a file with " << endl;
cerr << " one bam (full path) per line. " << endl;
cerr << " -a <STRING> The reference genome (indexed fasta). " << endl;
cerr << endl;
cerr << " Optional: Recommended flags are noted with : * " << endl;
cerr << " -s <FLAG> Exits the program after the stats are " << endl;
cerr << " gathered. [false] " << endl;
cerr << " -g <STRING> File to write graph to (very large output). [false] " << endl;
cerr << " *|-c -e <STRING> Comma sep. list of seqids to skip [false]. " << endl;
cerr << " *|-e -c <STRING> Comma sep. list of seqids to keep [false]. " << endl;
cerr << " -r <STRING> Region in format: seqid:start-end [whole genome] " << endl;
cerr << " * -x <INT> Number of CPUs to use [1 CPU]. " << endl;
cerr << " -m <INT> Mapping quality filter [20]. " << endl;
cerr << " -i <STRING> non standard split read tag [SA] " << endl;
cerr << " -z <FLAG> Sample reads until success. [false] " << endl;
cerr << " -d <INT> Minimum number of matching bases (both reads).[100] " << endl;
cerr << endl;
cerr << " Output: " << endl;
cerr << " STDERR: Run statistics and bam stats " << endl;
cerr << " STOUT : SV calls in VCF " << endl;
cerr << endl;
cerr << " Details: " << endl;
cerr << " -z <FLAG> WHAM-GRAPHENING can fail if does not sample " << endl;
cerr << " enough reads. This flag prevents whamg " << endl;
cerr << " from exiting. If your bam header has seqids not in " << endl;
cerr << " the bam (e.g. split by region) use -z. " << endl;
cerr << " -i <STRING> WHAM-GRAPHENING uses the optional bwa-mem SA tag. " << endl;
cerr << " Older version of bwa-mem used XP. " << endl;
cerr << " -e|-c <STRING> A list of seqids to include or exclude while " << endl;
cerr << " sampling insert and depth. For humans you should " << endl;
cerr << " use the standard chromosomes 1,2,3...X,Y. " << endl;
cerr << endl;
printVersion();
}
//------------------------------- SUBROUTINE --------------------------------
bool breakSort(breakpoint * L, breakpoint * R){
if(L->IsMasked() || R->IsMasked()){
return false;
}
if(L->nodeL->seqid == R->nodeL->seqid){
if(L->nodeL->pos <= R->nodeL->pos){
return true;
}
else{
return false;
}
}
else{
if(L->nodeL->seqid < R->nodeL->seqid){
return true;
}
else{
return false;
}
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : NA
Function does : prints vcf header
Function returns: NA
*/
void printVCF(std::vector<breakpoint*> & bp){
std::vector<breakpoint*> scrubbed;
for(std::vector<breakpoint*>::iterator it = bp.begin();
it != bp.end(); it++){
if((*it)->IsMasked()){
continue;
}
if(!(*it)->IsPrint()){
continue;
}
if((*it)->nodeL->seqid != (*it)->nodeR->seqid){
continue;
}
if((*it)->getType() == 'T'){
continue;
}
if((*it)->getTotalSupport() < 3){
continue;
}
scrubbed.push_back(*it);
}
sort(scrubbed.begin(), scrubbed.end(), breakSort);
stringstream header;
header << "##fileformat=VCFv4.2" << std::endl;
header << "##source=WHAM-GRAPHENING:" << VERSION << std::endl;
header << "##reference=" << globalOpts.fasta << std::endl;
header << "##INFO=<ID=A,Number=1,Type=Integer,Description=\"Total pieces of evidence\">" << std::endl;
header << "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">" << std::endl;
header << "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">" << std::endl;
header << "##INFO=<ID=CF,Number=1,Type=Float,Description=\"Fraction of reads in graph that cluster with SVTYPE pattern\">" << std::endl;
header << "##INFO=<ID=CW,Number=5,Type=Float,Description=\"SVTYPE weight 0-1; DEL,DUP,INV,INS,BND\">" << std::endl;
header << "##INFO=<ID=D,Number=1,Type=Integer,Description=\"Number of reads supporting a deletion\">" << std::endl;
header << "##INFO=<ID=DI,Number=1,Type=Float,Description=\"Average distance of mates to breakpoint\">" << std::endl;
header << "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">" << std::endl;
header << "##INFO=<ID=EV,Number=1,Type=Integer,Description=\"Number everted mate-pairs\">" << std::endl;
header << "##INFO=<ID=I,Number=1,Type=Integer,Description=\"Number of reads supporting an insertion\">" << endl;
header << "##INFO=<ID=SR,Number=1,Type=Integer,Description=\"Number of split-reads supporing SV\">" << std::endl;
header << "##INFO=<ID=SS,Number=1,Type=Integer,Description=\"Number of split-reads supporing SV\">" << std::endl;
header << "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">" << std::endl;
header << "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">" << std::endl;
header << "##INFO=<ID=T,Number=1,Type=Integer,Description=\"Number of reads supporting a BND\">" << std::endl;
header << "##INFO=<ID=TAGS,Number=.,Type=String,Description=\"SM tags with breakpoint support\">" << std::endl;
header << "##INFO=<ID=TF,Number=1,Type=Integer,Description=\"Number of reads mapped too far\">" << std::endl;
header << "##INFO=<ID=U,Number=1,Type=Integer,Description=\"Number of reads supporting a duplication\">" << std::endl;
header << "##INFO=<ID=V,Number=1,Type=Integer,Description=\"Number of reads supporting an inversion\">" << std::endl;
header << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << std::endl;
header << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">" << std::endl;
header << "##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Per sample SV support\">" << std::endl;
header << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT" ;
for(vector<string>::iterator iz = globalOpts.targetBams.begin();
iz != globalOpts.targetBams.end(); iz++){
if(globalOpts.SMTAGS.find(*iz) == globalOpts.SMTAGS.end()){
cerr << "FATAL: could not find SM tag for: " << *iz << endl;
exit(1);
}
header << "\t" << globalOpts.SMTAGS[*iz];
}
cout << header.str() << endl;
for(std::vector<breakpoint*>::iterator it = scrubbed.begin();
it != scrubbed.end(); it++){
if((*it)->IsMasked()){
continue;
}
if((*it)->getType() == 'T'){
continue;
}
else{
(*it)->loadSMSupport();
std::cout << **it << "\tGT:DP:SP";
for(vector<string>::iterator iz = globalOpts.targetBams.begin();
iz != globalOpts.targetBams.end(); iz++){
std::cout << "\t.:.:" << (*it)->getSMSupport(globalOpts.SMTAGS[*iz]);
}
std::cout << std::endl;
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : node pointer, vector of node pointers
Function does : populates a vector for a sub graph
Function returns: void
*/
void getTree(node * n, vector<node *> & ns){
map<edge *, int> seenEdges ;
map<node *, int> seenNodes ;
vector<edge *> edges ;
edges.insert(edges.end(), n->eds.begin(), n->eds.end());
/* if something is pushed to the back of the vector it changes the
positions ! be warned. */
while(!edges.empty()){
edge * last = edges.back();
edges.pop_back();
for(std::vector<edge *>::iterator it = last->L->eds.begin();
it != last->L->eds.end(); it++){
if(seenEdges.find(*it) == seenEdges.end()){
edges.push_back(*it);
seenEdges[*it] = 1;
seenNodes[(*it)->L] = 1;
seenNodes[(*it)->R] = 1;
}
}
for(std::vector<edge *>::iterator it = last->R->eds.begin();
it != last->R->eds.end(); it++){
if(seenEdges.find(*it) == seenEdges.end()){
edges.push_back(*it);
seenEdges[*it] = 1;
seenNodes[(*it)->L] = 1;
seenNodes[(*it)->R] = 1;
}
}
}
for(std::map<node *, int>::iterator it = seenNodes.begin();
it != seenNodes.end(); it++){
ns.push_back(it->first);
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : read pair pointer
Function does : returns true is reads are everted
Function returns: bool
*/
inline bool isEverted(readPair * rp){
if(!rp->al1.IsMapped() || !rp->al2.IsMapped()){
return false;
}
if(rp->al1.RefID != rp->al2.RefID){
return false;
}
if(rp->al1.Position <= rp->al2.Position){
if(rp->al1.IsReverseStrand()
&& (!rp->al2.IsReverseStrand()) ){
return true;
}
}
else{
if(rp->al2.IsReverseStrand()
&& (!rp->al1.IsReverseStrand()) ){
return true;
}
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : left position, right position, local graph
Function does : adds nodes or builds connections
Function returns: void
*/
void addIndelToGraph(int refIDL,
int refIDR,
int l,
int r,
char s,
string & SM){
omp_set_lock(&glock);
if( ! isInGraph(refIDL, l, globalGraph)
&& ! isInGraph(refIDR, r, globalGraph) ){
node * nodeL;
node * nodeR;
edge * ed ;
nodeL = new node;
nodeR = new node;
ed = new edge;
nodeL->sm[SM] = 1;
nodeR->sm[SM] = 1;
initEdge(ed);
ed->support[s] +=1;
ed->L = nodeL;
ed->R = nodeR;
nodeL->eds.push_back(ed);
nodeR->eds.push_back(ed);
nodeL->pos = l;
nodeL->seqid = refIDL;
nodeR->pos = r;
nodeR->seqid = refIDR;
globalGraph.nodes[refIDL][l] = nodeL;
globalGraph.nodes[refIDR][r] = nodeR;
}
else if(isInGraph(refIDL, l, globalGraph)
&& ! isInGraph(refIDR, r, globalGraph)){
node * nodeR;
edge * ed;
nodeR = new node;
ed = new edge;
nodeR->sm[SM] = 1;
initEdge(ed);
ed->support[s] += 1;
nodeR->pos = r;
nodeR->seqid = refIDR;
ed->L = globalGraph.nodes[refIDL][l];
ed->R = nodeR;
nodeR->eds.push_back(ed);
globalGraph.nodes[refIDL][l]->eds.push_back(ed);
globalGraph.nodes[refIDR][r] = nodeR;
if(globalGraph.nodes[refIDL][l]->sm.find(SM) ==
globalGraph.nodes[refIDL][l]->sm.end()){
globalGraph.nodes[refIDL][l]->sm[SM] = 1;
}
else{
globalGraph.nodes[refIDL][l]->sm[SM] += 1;
}
}
else if(! isInGraph(refIDL, l, globalGraph)
&& isInGraph(refIDR, r, globalGraph)){
node * nodeL;
edge * ed;
nodeL = new node;
ed = new edge;
initEdge(ed);
ed->support[s] += 1;
nodeL->pos = l;
nodeL->seqid = refIDL;
ed->R = globalGraph.nodes[refIDR][r];
ed->L = nodeL;
nodeL->eds.push_back(ed);
globalGraph.nodes[refIDR][r]->eds.push_back(ed);
globalGraph.nodes[refIDL][l] = nodeL;
if(globalGraph.nodes[refIDR][r]->sm.find(SM) ==
globalGraph.nodes[refIDR][r]->sm.end()){
globalGraph.nodes[refIDR][r]->sm[SM] = 1;
}
else{
globalGraph.nodes[refIDR][r]->sm[SM] += 1;
}
}
else{
uint hit = 0;
if(globalGraph.nodes[refIDL][l]->sm.find(SM)
== globalGraph.nodes[refIDL][l]->sm.end()){
globalGraph.nodes[refIDL][l]->sm[SM] = 1;
}
else{
globalGraph.nodes[refIDL][l]->sm[SM] += 1;
}
if(globalGraph.nodes[refIDR][r]->sm.find(SM)
== globalGraph.nodes[refIDR][r]->sm.end()){
globalGraph.nodes[refIDR][r]->sm[SM] = 1;
}
else{
globalGraph.nodes[refIDR][r]->sm[SM] += 1;
}
for(vector<edge *>::iterator ite
= globalGraph.nodes[refIDL][l]->eds.begin();
ite != globalGraph.nodes[refIDL][l]->eds.end(); ite++){
if((*ite)->L->pos == l && (*ite)->R->pos == r){
(*ite)->support[s] += 1;
hit = 1;
}
}
if(hit == 0){
edge * ne;
ne = new edge;
initEdge(ne);
ne->support[s]+=1;
ne->L = globalGraph.nodes[refIDL][l];
ne->R = globalGraph.nodes[refIDR][r];
globalGraph.nodes[refIDL][l]->eds.push_back(ne);
globalGraph.nodes[refIDR][r]->eds.push_back(ne);
}
}
omp_unset_lock(&glock);
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : bam alignment, sm tag
Function does : finds the positions of indels
Function returns: string
*/
bool indelToGraph(BamAlignment & ba, string & SM){
if(ba.MapQuality < 30){
return false;
}
bool hit = false;
int p = ba.Position;
for(vector<CigarOp>::iterator ci = ba.CigarData.begin();
ci != ba.CigarData.end(); ci++){
// 20bp seems reasonable
if(ci->Length < 20){
continue;
}
switch(ci->Type){
case 'M':
{
p += ci->Length;
break;
}
case '=':
{
p += ci->Length;
break;
}
case 'N':
{
p += ci->Length;
break;
}
case 'X':
{
p += ci->Length;
break;
}
case 'I':
{
hit = true;
addIndelToGraph(ba.RefID, ba.RefID, p, (p + ci->Length), 'I', SM);
break;
}
case 'D':
{
hit = true;
addIndelToGraph(ba.RefID, ba.RefID, p, (p + ci->Length), 'D', SM);
p += ci->Length;
break;
}
default :
{
break;
}
}
}
return hit;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : read pair to filter
Function does : fails poor quality or non-sv read pairs
Function returns: true = bad, false = good;
*/
inline bool pairFailed(readPair * rp){
if(! rp->al1.IsMapped() || ! rp->al2.IsMapped() ){
return true;
}
if(rp->al1.MapQuality < 5
|| rp->al2.MapQuality < 5 ){
return true;
}
if(rp->al1.MapQuality < globalOpts.MQ
&& rp->al2.MapQuality < globalOpts.MQ){
return true;
}
if(rp->al1.Length == rp->al1.CigarData[0].Length
&& rp->al1.CigarData[0].Type == 'M' &&
rp->al2.Length == rp->al2.CigarData[0].Length
&& rp->al2.CigarData[0].Type == 'M' ){
return true;
}
if((match(rp->al1.CigarData) + match(rp->al2.CigarData)) < globalOpts.minPairMatch){
return true;
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : bam alignment, vector<saTag>
Function does : finds links for splitter to put in graph
Function returns: NA
*/
void splitToGraph(BamAlignment & al,
vector<saTag> & sa,
string & SM ){
// too many chimeras ... for me
if(sa.size() > 1){
return;
}
// too many mismatches
if(sa.front().NM > globalOpts.NM){
return;
}
// map quality of zero...
if(sa.front().mapQ < 5){
return;
}
// split read is trimmed on both side skip
if(sa.front().cig.front().Type == 'S'
&& sa.front().cig.back().Type == 'S'){
return;
}
// alignment is trimmed on both sides skip
if(al.CigarData.front().Type == 'S'
&& al.CigarData.back().Type == 'S'){
return;
}
char support = 'S';
// split reads are on different strands
if((sa[0].strand && ! al.IsReverseStrand())
|| (! sa[0].strand && al.IsReverseStrand() )){
support = 'V';
}
int start = al.Position ;
int end = sa.front().pos ;
/* since both sides are not clipped if the back is clipped we know
the front is not */
bool readFront = true;
if(al.CigarData.back().Type == 'S'){
start = al.GetEndPosition(false,true);
readFront = false;
}
/* we also know that both sides of the split read are not trimmed */
if(sa.front().cig.back().Type == 'S'){
endPos(sa[0].cig, &end) ;
}
if(readFront){
if(end > start){
support = 'Z';
}
}
else{
if(end < start){
support = 'Z';
}
}
addIndelToGraph(al.RefID, sa[0].seqid, start, end, support, SM);
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : pointer to readPair;
Function does : adds high and low insert sizes to graph. both pairs are
treated as mapped
Function returns: NA
*/
bool deviantInsertSize(readPair * rp, char supportType, string & SM){
//can't be deviant on different seqids
if(rp->al1.RefID != rp->al2.RefID){
return false;
}
// both reads are clipped, skip
if(! IsLongClip(rp->al1.CigarData, 1)
&& ! IsLongClip(rp->al2.CigarData, 1)){
return false;
}
int start = rp->al1.Position;
int end = rp->al2.Position;
if(rp->al1.CigarData.front().Type == 'S'
|| rp->al1.CigarData.back().Type == 'S'){
if(rp->al2.CigarData.back().Type == 'S'){
end = rp->al2.GetEndPosition(false,true);
}
if(rp->al1.CigarData.back().Type == 'S'){
start = rp->al1.GetEndPosition(false,true);
}
}
else{
if(rp->al1.CigarData.back().Type == 'S'){
end = rp->al1.GetEndPosition(false,true);
}
if(rp->al2.CigarData.back().Type == 'S'){
start = rp->al2.GetEndPosition(false,true);
}
}
addIndelToGraph(rp->al2.RefID, rp->al2.RefID, start, end, supportType, SM);
return true;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : read pair pointer
Function does : -->s s<-- tests if pairs suggests insertion
Function returns: bool
*/
inline bool isPointIn(readPair * rp){
if(!rp->al1.IsMapped() || !rp->al2.IsMapped()){
return false;
}
if(rp->al1.RefID != rp->al2.RefID){
return false;
}
if(rp->al1.Position <= rp->al2.Position){
if(rp->al1.CigarData.back().Type == 'S' &&
rp->al2.CigarData.front().Type == 'S'){
return true;
}
}
else{
if(rp->al1.CigarData.front().Type == 'S' &&
rp->al2.CigarData.back().Type == 'S'){
return true;
}
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : pointer to readPair
Function does : calulates if overlap is too much
Function returns: bool
*/
inline bool tooMuchOverlap(readPair * rp){
if(rp->al1.RefID != rp->al2.RefID){
return false;
}
if(rp->al1.Position > rp->al2.GetEndPosition(false,true) ||
rp->al2.Position > rp->al1.GetEndPosition(false,true)){
return false;
}
long int maxStart = std::max(rp->al1.Position, rp->al2.Position);
long int minEnd = std::min(rp->al1.GetEndPosition(false,true),
rp->al1.GetEndPosition(false,true));
double perOver = double(minEnd - maxStart)
/ double(rp->al1.Length + rp->al2.Length);
if(perOver > 0.2){
return true;
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : pointer to readPair ; seqid to index
Function does : processes Pair
Function returns: NA
*/
void processPair(readPair * rp,
double * low,
double * high,
string & SM){
if(globalOpts.noInterSeqid && rp->al1.RefID != rp->al2.RefID){
return;
}
if(pairFailed(rp)){
return;
}
if(isPointIn(rp)){
return;
}
if(tooMuchOverlap(rp)){
return;
}
string sa1;
string sa2;
bool sameStrand = false;
bool everted = isEverted(rp);
string xa;
std::vector<std::string> xas;
if(rp->al1.GetTag("XA", xa)){
xas = split(xa, ";");
if(xas.size() > 3){
return;
}
}
if(rp->al2.GetTag("XA", xa)){
xas = split(xa, ";");
if(xas.size() > 3){
return;
}
}
if(rp->al1.RefID == rp->al2.RefID){
indelToGraph(rp->al1, SM);
indelToGraph(rp->al2, SM);
}
// nm filter
std::string nm;
if(rp->al1.GetTag("NM", nm)){
if(atoi(nm.c_str()) > globalOpts.NM){
return;
}
}
if(rp->al2.GetTag("NM", nm)){
if(atoi(nm.c_str()) > globalOpts.NM){
return;
}
}
if( ! IsLongClip(rp->al1.CigarData, 5)
&& ! IsLongClip(rp->al2.CigarData, 5)){
return;
}
if((rp->al1.IsReverseStrand() && rp->al2.IsReverseStrand())
|| (! rp->al1.IsReverseStrand() && ! rp->al2.IsReverseStrand()) ){
sameStrand = true;
}
if(! everted){
if( abs(rp->al1.InsertSize) > *high){
if(sameStrand){
deviantInsertSize(rp, 'M', SM);
}
else{
deviantInsertSize(rp, 'H', SM);
}
}
else if(abs(rp->al1.InsertSize) < *low ){
if(sameStrand){
deviantInsertSize(rp, 'R', SM);
}
else{
deviantInsertSize(rp, 'L', SM);
}
}
else{
if(sameStrand){
deviantInsertSize(rp, 'A', SM);
}
}
}
else{
deviantInsertSize(rp, 'X', SM);
}
// put the split reads in the graph
if(rp->al1.GetTag( globalOpts.saT, sa1)){
vector<saTag> parsedSa1;
parseSA(parsedSa1, sa1, globalOpts.saT, inverse_lookup);
splitToGraph(rp->al1, parsedSa1, SM);
}
if(rp->al2.GetTag( globalOpts.saT, sa2)){
vector<saTag> parsedSa2;
parseSA(parsedSa2, sa2, globalOpts.saT, inverse_lookup);
splitToGraph(rp->al2, parsedSa2, SM);
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : filename, seqid, start, end, refSeqs (bamtools object),
paired end store.
Function does : Matches paired ends so they can be processed together.
The global pair store is loaded for the missing mates
Function returns: NA
*/
bool runRegion(string filename,
int seqidIndex,
int start,
int end,
vector< RefData > seqNames,
string & SM){
// local read pair store
map<string, readPair *>pairStoreLocal;
double high = insertDists.upr[filename] ;
double low = insertDists.low[filename] ;
BamReader br;
br.Open(filename);
if(! br.LocateIndex()){
vector<string> fileName = split(filename, ".");
fileName.back() = "bai";
string indexName = join(fileName, ".");
if(! br.OpenIndex(indexName) ){
cerr << "FATAL: cannot find bam index." << endl;
}
}
if(!br.SetRegion(seqidIndex, start, seqidIndex, end)){
return false;
}
BamAlignment al;
while(br.GetNextAlignmentCore(al)){
if((al.AlignmentFlag & 0x0800) != 0 ){
continue;
}
if((al.AlignmentFlag & 0x0100) != 0 ){
continue;
}
if(! al.IsPaired()){
continue;
}
if(! al.IsMapped() && ! al.IsMateMapped()){
continue;
}
if(al.IsDuplicate()){
continue;
}
if(! al.IsPrimaryAlignment()){
continue;
}
// dna and read name
al.BuildCharData();
if(pairStoreLocal.find(al.Name) != pairStoreLocal.end()){
pairStoreLocal[al.Name]->count += 1;
if(pairStoreLocal[al.Name]->flag == 2){
pairStoreLocal[al.Name]->al1 = al;
}
else{
pairStoreLocal[al.Name]->al2 = al;
}
processPair(pairStoreLocal[al.Name], &low, &high, SM);
delete pairStoreLocal[al.Name];
pairStoreLocal.erase(al.Name);
}
else{
readPair * rp;
rp = new readPair;
rp->count = 1;
if(al.IsFirstMate()){
rp->flag = 1 ;
rp->al1 = al;
}
else{
rp->flag = 2 ;
rp->al2 = al;
}
pairStoreLocal[al.Name] = rp;
}
}
// close the bam
br.Close();
// load lonely reads into the global struct;
// if it finds a mate in the global store it processes and deletes
omp_set_lock(&pslock);
for(map<string, readPair *>::iterator rps = pairStoreLocal.begin();
rps != pairStoreLocal.end(); rps++){
if(globalPairStore.find(rps->first) != globalPairStore.end()){
globalPairStore[rps->first]->count += 1;
if(globalPairStore[rps->first]->flag == 1 && (*rps->second).flag == 1){
continue;
}
if(globalPairStore[rps->first]->flag == 2 && (*rps->second).flag == 2){
continue;
}
if(globalPairStore[rps->first]->flag == 1){
(*globalPairStore[rps->first]).al2 = (*rps->second).al2;
}
else{
(*globalPairStore[rps->first]).al1 = (*rps->second).al1;
}
processPair(globalPairStore[rps->first], &low, &high, SM);
delete globalPairStore[rps->first];
delete pairStoreLocal[rps->first];
globalPairStore.erase(rps->first);
}
else{
globalPairStore[rps->first] = pairStoreLocal[rps->first];
}
}
omp_unset_lock(&pslock);
return true;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : string
Function does : reads bam into graph
Function returns: NA
*/
void loadBam(string & bamFile){
bool region = false;
int start = 0;
int end = 0;
string regionSID;
omp_set_lock(&lock);
if(!globalOpts.seqid.empty()){
region = true;
regionSID = globalOpts.seqid;
start = globalOpts.region.front();
end = globalOpts.region.back();
}
omp_unset_lock(&lock);
BamReader br;
if(! br.Open(bamFile)){
cerr << "\n" << "FATAL: could not open: " << bamFile << endl;
exit(1);
}
if(! br.LocateIndex()){
vector<string> fileName = split(bamFile, ".");
fileName.back() = "bai";
string indexName = join(fileName, ".");
cerr << "INFO: Did not find *bam.bai" << endl;
cerr << "INFO: Trying: " << indexName << endl;
if(! br.OpenIndex(indexName) ){
cerr << "FATAL: cannot find bam index." << endl;
}
}
// grabbing the header
SamHeader SH = br.GetHeader();
if(!SH.HasReadGroups()){
cerr << endl;
cerr << "FATAL: No @RG detected in header. WHAM uses \"SM:sample\"." << endl;
cerr << endl;
}
SamReadGroupDictionary RG = SH.ReadGroups;
string SM;
if(!RG.Begin()->HasSample()){
cerr << endl;
cerr << "FATAL: No SM tag in bam file." << endl;
exit(1);
cerr << endl;
}
SM = RG.Begin()->Sample;
// if the bam is not sorted die
if(!SH.HasSortOrder()){
cerr << "FATAL: sorted bams must have the @HD SO: tag in each SAM header: " << bamFile << endl;
exit(1);
}
RefVector sequences = br.GetReferenceData();
//chunking up the genome
vector< regionDat* > regions;
int seqidIndex = 0;
if(region){
int p = start;
int e = 0;
for(; (p+1000000) <= end; p += 1000000){
regionDat * regionInfo = new regionDat;
regionInfo->seqidIndex = inverse_lookup[regionSID];
regionInfo->start = p ;
regionInfo->end = 1000000 + p ;
regions.push_back(regionInfo);
e = p + 1000000;
}
if(e < end){
regionDat * regionInfo = new regionDat;
regionInfo->seqidIndex = inverse_lookup[regionSID];
regionInfo->start = p ;
regionInfo->end = end ;
regions.push_back(regionInfo);
}
}
else{
seqidIndex = 0;
for(vector< RefData >::iterator sit = sequences.begin();
sit != sequences.end(); sit++){
int start = 0;
if(globalOpts.toSkip.find( (*sit).RefName )
== globalOpts.toSkip.end() && ((*sit).RefLength > 1000)){
for(;start < (*sit).RefLength ; start += 1000000){
regionDat * chunk = new regionDat;
chunk->seqidIndex = seqidIndex;
chunk->start = start;
chunk->end = start + 1000000 ;
regions.push_back(chunk);
}
regionDat * lastChunk = new regionDat;
lastChunk->seqidIndex = seqidIndex;
lastChunk->start = start;
lastChunk->end = (*sit).RefLength;
seqidIndex += 1;
if(start < (*sit).RefLength){
regions.push_back(lastChunk);
}
}
else{
seqidIndex += 1;
}
}
}
// closing the bam reader before running regions
br.Close();
int Mb = 0;
// running the regions with openMP
#pragma omp parallel for schedule(dynamic, 3)
for(unsigned int re = 0; re < regions.size(); re++){
if(! runRegion(bamFile ,
regions[re]->seqidIndex,
regions[re]->start ,
regions[re]->end ,
sequences ,
SM )){
omp_set_lock(&lock);
cerr << "WARNING: region failed to run properly: "
<< sequences[regions[re]->seqidIndex].RefName
<< ":" << regions[re]->start << "-"
<< regions[re]->end
<< endl;
omp_unset_lock(&lock);
}
else{
delete regions[re];
omp_set_lock(&lock);
Mb += 1;
if((Mb % 100) == 0 ){
cerr << "INFO: " << SM
<< ": processed "
<< Mb << "Mb of the genome." << endl;
}
omp_unset_lock(&lock);
}
}
cerr << "INFO: " << bamFile << " had "
<< globalPairStore.size()
<< " reads that were not processed"
<< endl;
// cleanup remaining reads
omp_set_lock(&pslock);
for(map<string, readPair*>::iterator rps = globalPairStore.begin();
rps != globalPairStore.end(); rps++){
delete rps->second;
}
globalPairStore.clear();
omp_unset_lock(&pslock);
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : string
Function does : loads filenames for bam files
Function returns: bool
*/
bool loadTargetBamFileNames(std::string fns){
std::string line;
std::vector<std::string> ext = split(fns, ".");
if(ext.back() == "bam"){
globalOpts.targetBams = split(fns, ",");
}
else{
ifstream myfile(fns.c_str());
if(myfile){
while (getline( myfile, line )){
globalOpts.targetBams.push_back(line);
}
}
}
return true;
}
//------------------------------- OPTIONS --------------------------------
int parseOpts(int argc, char** argv)
{
int opt = 0;
opt = getopt(argc, argv, optString);
while(opt != -1){
switch(opt){
case 'd':
{
globalOpts.minPairMatch = atoi(((string)optarg).c_str());
cerr << "INFO: whamg will keep read pairs with " << globalOpts.minPairMatch << " matching bases." << endl;
break;
}
case 'i':
{
globalOpts.saT = optarg;
if(globalOpts.saT != "XP" && globalOpts.saT != "XP"){
cerr << "FATAL: only SA and XP optional tags are supported for split reads" << endl;
exit(1);
}
cerr << "INFO: You are using a non standard split-read tag: " << globalOpts.saT << endl;
break;
}
case 'u':
{
cerr << "INFO: You are using a hidden flag." << endl;
globalOpts.lastSeqid = atoi(((string)optarg).c_str());
cerr << "INFO: Random sampling will only go up to: " << globalOpts.lastSeqid << endl;
break;
}
case 'z':
{
globalOpts.keepTrying = true;
cerr << "INFO: WHAM-GRAPHENING will not give up sampling reads: -z set" << globalOpts.svs << endl;
break;
}
case 's':
{
globalOpts.statsOnly = true;
break;
}
case 'g':
{
globalOpts.graphOut = optarg;
cerr << "INFO: graphs will be written to: " << globalOpts.graphOut
<< endl;
break;
}
case 'a':
{
globalOpts.fasta = optarg;
cerr << "INFO: fasta file: " << globalOpts.fasta << endl;
break;
}
case 'e':
{
vector<string> seqidsToSkip = split(optarg, ",");
cerr << "INFO: WHAM will skip seqid: " << optarg << endl;
for(unsigned int i = 0; i < seqidsToSkip.size(); i++){
globalOpts.toSkip[seqidsToSkip[i]] = 1;
}
break;
}
case 'c':
{
vector<string> seqidsToInclude = split(optarg, ",");
for(unsigned int i = 0; i < seqidsToInclude.size(); i++){
globalOpts.toInclude[seqidsToInclude[i]] = 1;
cerr << "INFO: WHAM will analyze seqid: " << seqidsToInclude[i] << endl;
}
break;
}
case 'f':
{
loadTargetBamFileNames(string(optarg));
cerr << "INFO: target bams:\n" << joinReturn(globalOpts.targetBams) ;
break;
}
case 'h':
{
printHelp();
exit(1);
break;
}
case '?':
{
break;
}
case 'm':
{
globalOpts.MQ = atoi(((string)optarg).c_str());
cerr << "INFO: Reads with mapping quality below " << globalOpts.MQ << " will be filtered. " << endl;
break;
}
case 'x':
{
globalOpts.nthreads = atoi(((string)optarg).c_str());
cerr << "INFO: OpenMP will roughly use " << globalOpts.nthreads
<< " threads" << endl;
break;
}
case 'r':
{
vector<string> tmp_region = split(optarg, ":");
if(tmp_region.size() != 2 || tmp_region[1].empty() || tmp_region[0].empty()){
cerr << "FATAL: region was not set correctly" << endl;
cerr << "INFO: region format: seqid:start-end" << endl;
exit(1);
}
vector<string> start_end = split(tmp_region[1], "-");
globalOpts.seqid = tmp_region[0];
globalOpts.region.push_back(atoi(start_end[0].c_str()));
globalOpts.region.push_back(atoi(start_end[1].c_str()));
if(start_end.size() !=2 || start_end[0].empty() || start_end[1].empty()){
cerr << "FATAL: region was not set correctly" << endl;
cerr << "INFO: region format: seqid:start-end" << endl;
exit(1);
}
cerr << "INFO: region set to: " << globalOpts.seqid << ":" << globalOpts.region[0] << "-" << globalOpts.region[1] << endl;
if(globalOpts.region.size() != 2){
cerr << "FATAL: incorrectly formatted region." << endl;
cerr << "FATAL: wham is now exiting." << endl;
exit(1);
}
break;
}
default:
{
cerr << "FATAL: unknown command line option" << endl;
exit(1);
}
}
opt = getopt( argc, argv, optString );
}
return 1;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : vector<nodes *>
Function does : dumps a graph in dot format
Function returns: string
*/
string dotviz(vector<node *> & ns){
stringstream ss;
ss << "graph {\n";
for(vector<node *>::iterator it = ns.begin();
it != ns.end(); it++){
for(vector<edge *>:: iterator iz = (*it)->eds.begin();
iz != (*it)->eds.end(); iz++){
if((*iz)->support['X'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [style=dashed,penwidth=" << (*iz)->support['X'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [style=dashed,penwidth=" << (*iz)->support['X'] << "];\n";
}
}
if((*iz)->support['R'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [color=yellow,penwidth=" << (*iz)->support['R'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [color=yellow,penwidth=" << (*iz)->support['R'] << "];\n";
}
}
if((*iz)->support['M'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [color=magenta,penwidth=" << (*iz)->support['M'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [color=magenta,penwidth=" << (*iz)->support['M'] << "];\n";
}
}
if((*iz)->support['V'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [colo\
r=green,penwidth=" << (*iz)->support['V'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [colo\
r=green,penwidth=" << (*iz)->support['V'] << "];\n";
}
}
if((*iz)->support['L'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [color=brown,penwidth=" << (*iz)->support['L'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [color=brown,penwidth=" << (*iz)->support['L'] << "];\n";
}
}
if((*iz)->support['H'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [color=purple,penwidth=" << (*iz)->support['H'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [color=purple,penwidth=" << (*iz)->support['H'] << "];\n";
}
}
if((*iz)->support['S'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [style=dotted,penwidth=" << (*iz)->support['S'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [style=dotted,penwidth=" << (*iz)->support['S'] << "];\n";
}
}
if((*iz)->support['I'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [color=red,penwidth=" << (*iz)->support['I'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [color=red,penwidth=" << (*iz)->support['I'] << "];\n";
}
}
if((*iz)->support['D'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [color=blue,penwidth=" << (*iz)->support['D'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [color=blue,penwidth=" << (*iz)->support['D'] << "];\n";
}
}
if((*iz)->support['A'] > 0){
if((*it)->pos != (*iz)->L->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->L->seqid << "." << (*iz)->L->pos << " [c\
olor=orange,penwidth=" << (*iz)->support['A'] << "];\n";
}
if((*it)->pos != (*iz)->R->pos){
ss << " " << (*it)->seqid << "." << (*it)->pos << " -- " << (*iz)->R->seqid << "." << (*iz)->R->pos << " [c\
olor=orange,penwidth=" << (*iz)->support['A'] << "];\n";
}
}
}
}
ss << "}";
return ss.str();
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : vector<nodes *>, and a breakpoint
Function does : checks if inversion meets basic requirements
Function returns: NA
*/
void doubleCheckIns(std::vector<breakpoint *> & bks, unpairedSVs & up){
for(std::vector<breakpoint *>::iterator it = bks.begin();
it != bks.end(); it++){
if((*it)->IsMasked()){
continue;
}
if((*it)->getType() != 'I'){
continue;
}
if(((*it)->getTooCloseCount() > 2 && (*it)->getInsCount() > 2)
|| (*it)->getInsCount() > 4){
// switching starts because one is a terminal node
if((*it)->nodeR->eds.size() > (*it)->nodeL->eds.size()){
node * tmp;
tmp = (*it)->nodeR;
(*it)->nodeR = (*it)->nodeL;
(*it)->nodeL = tmp;
}
}
else{
(*it)->unSetPrint();
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : vector<nodes *>, and a breakpoint
Function does : checks if inversion meets basic requirements
Function returns: NA
*/
void doubleCheckInv(std::vector<breakpoint *> & bks, unpairedSVs & up){
for(std::vector<breakpoint *>::iterator it = bks.begin();
it != bks.end(); it++){
if((*it)->IsMasked()){
continue;
}
if((*it)->getType() != 'V'){
continue;
}
if((*it)->getClustFrac() > 0.1
&& (*it)->getSameStrandCount() > 2
&& (*it)->getInvCount() > 2 ){
}
else{
if((*it)->getLength() < 500
&& ((*it)->getSameStrandCount() < 3
|| (*it)->getInvCount() < 3
|| (*it)->getSplitReadCount() < 1)
){
(*it)->unSetPrint();
}
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : vector<nodes *>, and a breakpoint
Function does : finds the best within graph link
Function returns: string
*/
void doubleCheckDup(std::vector<breakpoint *> & bks, unpairedSVs & up){
for(std::vector<breakpoint *>::iterator it = bks.begin();
it != bks.end(); it++){
if((*it)->IsMasked()){
continue;
}
if((*it)->getType() != 'U'){
continue;
}
if(((*it)->getClustFrac() > 0.1 &&
(*it)->getEvertCount() > 1
&& (*it)->getDupCount() > 1)
|| ((*it)->getSplitReadCount() > 3 && (*it)->getLength() < 500)
){
}
else{
(*it)->unSetPrint();
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : vector<nodes *>, and a breakpoint
Function does : finds the best within graph link
Function returns: string
*/
void doubleCheckDel(std::vector<breakpoint *> & bks, unpairedSVs & up){
for(std::vector<breakpoint *>::iterator it = bks.begin();
it != bks.end(); it++){
if((*it)->IsMasked()){
continue;
}
if((*it)->getType() != 'D'){
continue;
}
if(((*it)->getClustFrac() > 0.1
&& (*it)->getTooFarCount() > 1
&& (*it)->getDelCount() > 1 )
|| ((*it)->getInternalDelCount() > 2 && (*it)->getSplitReadCount() > 2)
){
}
else{
(*it)->unSetPrint();
up[(*it)->getType()][(*it)->nodeL->seqid][(*it)->nodeL->pos] = *it;
up[(*it)->getType()][(*it)->nodeR->seqid][(*it)->nodeR->pos] = *it;
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : unpairedSVs & up, type char
Function does : tries to cluster and link deletions
Function returns: NA
*/
void clusterUnlinked(unpairedSVs & up, char type){
for(unpairedSVsChr::iterator it = up[type].begin();
it != up[type].end(); it++ ){
for(unpairedSVsTypePos::iterator iz = it->second.begin();
iz != it->second.end(); iz++){
unpairedSVsTypePos::iterator forwardScan = iz;
forwardScan++;
for(; forwardScan != it->second.end(); forwardScan++){
if(iz->second->delClusterCheck(iz->second->nodeL,
forwardScan->second->nodeR)){
std::cerr << "R hit " << iz->second->nodeL->pos
<< " " << forwardScan->second->nodeR->pos
<< " " << iz->second->getClustFrac()
<< " " << iz->second->getNClustered()
<< " " << iz->second->getAvgDist() << std::endl;
}
if(iz->second->delClusterCheck(iz->second->nodeL,
forwardScan->second->nodeL)){
std::cerr << "L hit " << iz->second->nodeL->pos
<< " " << forwardScan->second->nodeL->pos
<< " " << iz->second->getClustFrac()
<< " " << iz->second->getNClustered()
<< " " << iz->second->getAvgDist() << std::endl;
}
}
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : vector<nodes *>, and a breakpoint
Function does : finds the best within graph link
Function returns: string
*/
void findPairs(vector<node*> & tree,
breakpoint * bp,
map<int, map<int, breakpoint* > > & lookup){
if(tree.size() < 3 || tree.size() > 200){
bp->setMasked();
return;
}
int lmax = 0;
int rmax = 0;
int tmax = 0;
int edsn = 0;
node * finalL = NULL;
node * finalR = NULL;
for(vector<node *>::iterator lit = tree.begin();
lit != tree.end(); lit++){
for(vector<node *>::iterator rit = tree.begin();
rit != tree.end(); rit++){
if(*lit == *rit){
continue;
}
if(!connectedNode(*lit, *rit)){
continue;
}
int lmaxTmp = getSupport(*lit);
int rmaxTmp = getSupport(*rit);
int tmaxTmp = lmaxTmp + rmaxTmp;
int edsnTmp = ((*lit)->eds.size() + (*rit)->eds.size());
if(tmaxTmp > tmax){
finalL = *lit;
finalR = *rit;
lmax = lmaxTmp;
rmax = rmaxTmp;
tmax = tmaxTmp;
edsn = edsnTmp;
}
}
}
bp->add(finalL);
bp->add(finalR);
if(finalL->eds.size() == 1 || finalR->eds.size() == 1){
if(finalL->eds.front()->support['D'] < 5 &&
finalL->eds.front()->support['I'] < 5 &&
finalL->eds.front()->support['S'] < 5 &&
finalL->eds.front()->support['V'] < 5 &&
finalR->eds.front()->support['D'] < 5 &&
finalR->eds.front()->support['I'] < 5 &&
finalR->eds.front()->support['S'] < 5 &&
finalR->eds.front()->support['V'] < 5 ){
bp->setBadPair();
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : processes trees
Function does : tries to define type and breakpoints
Function returns: NA
*/
void gatherTrees(vector<vector<node *> > & trees){
map<int, map<int, int> > lookup;
for(map<int, map<int, node* > >::iterator it = globalGraph.nodes.begin();
it != globalGraph.nodes.end(); it++){
for(map<int, node*>::iterator itt = it->second.begin();
itt != it->second.end(); itt++){
if(lookup[it->first].find(itt->first) != lookup[it->first].end() ){
}
else{
lookup[it->first][itt->first] = 1;
vector<node *> tree;
getTree(globalGraph.nodes[it->first][itt->first], tree);
for(vector<node *>::iterator ir = tree.begin(); ir != tree.end(); ir++){
lookup[(*ir)->seqid][(*ir)->pos] = 1;
}
trees.push_back(tree);
}
}
}
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : nothing
Function does : dumps and shrinks graph
Function returns: NA
*/
void dump(vector< vector< node *> > & allTrees){
ofstream graphOutFile;
graphOutFile.open(globalOpts.graphOut);
for(vector< vector<node *> >::iterator it = allTrees.begin();
it != allTrees.end(); it++){
graphOutFile << dotviz(*it) << endl << endl;
}
graphOutFile.close();
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : bam file
Function does : generates stats for bam file
Function returns: NA
*/
void gatherBamStats(string & targetfile){
omp_set_lock(&lock);
int quals [126];
memcpy(quals, SangerLookup, 126*sizeof(int));
cerr << "INFO: gathering stats (may take some time) for bam: " << targetfile << endl;
omp_unset_lock(&lock);
vector<double> alIns ;
vector<double> nReads ;
vector<double> randomSWScore;
vector<int> mapQuality ;
BamReader bamR;
if(!bamR.Open(targetfile) ){
cerr << "FATAL: cannot find - or - read : " << targetfile << endl;
exit(1);
}
if(! bamR.LocateIndex()){
vector<string> fileName = split(targetfile, ".");
fileName.back() = "bai";
string indexName = join(fileName, ".");
if(! bamR.OpenIndex(indexName) ){
cerr << "FATAL: cannot find bam index." << endl;
}
}
SamHeader SH = bamR.GetHeader();
if(!SH.HasSortOrder()){
cerr << "FATAL: sorted bams must have the @HD SO: tag in each SAM header." << endl;
exit(1);
}
if(!SH.HasReadGroups()){
cerr << endl;
cerr << "FATAL: No @RG detected in header. WHAM uses \"SM:sample\"." << endl;
cerr << endl;
}
SamReadGroupDictionary RG = SH.ReadGroups;
string SM;
if(!RG.Begin()->HasSample()){
cerr << endl;
cerr << "FATAL: No SM tag in bam file." << endl;
exit(1);
cerr << endl;
}
SM = RG.Begin()->Sample;
omp_set_lock(&lock);
globalOpts.SMTAGS[targetfile] = SM;
omp_unset_lock(&lock);
RefVector sequences = bamR.GetReferenceData();
int i = 0; // index for while loop
int n = 0; // number of reads
BamAlignment al;
int qsum = 0;
int qnum = 0;
int fail = 0;
while(i < 8 || n < 100000){
if((n % 10000) == 0 && fail < 10){
omp_set_lock(&lock);
cerr << "INFO: processed " << n << " reads for: " << targetfile << endl;
omp_unset_lock(&lock);
}
fail += 1;
if(fail > 1000000 && (! globalOpts.keepTrying) ){
cerr << "FATAL: Unable to gather stats on bamfile: " << targetfile << endl;
cerr << "INFO: Consider using -z if bamfile was split by region." << endl;
exit(1);
}
uint max = sequences.size() ;
if(globalOpts.lastSeqid > 0){
max = globalOpts.lastSeqid;
}
int randomChr = 0;
bool exclude = true;
while(exclude){
if(sequences.size() > 1){
int prand = rand() % (max -1);
if(globalOpts.toSkip.find(sequences[prand].RefName) == globalOpts.toSkip.end()){
randomChr = prand;
exclude = false;
}
if(globalOpts.toInclude.size() > 0
&& globalOpts.toInclude.find(sequences[prand].RefName) == globalOpts.toInclude.end()){
exclude = true;
}
}
else{
exclude = false;
}
}
int randomPos = rand() % (sequences[randomChr].RefLength -1);
int randomEnd = randomPos + 2000;
if(randomEnd > sequences[randomChr].RefLength){
continue;
}
if(! bamR.SetRegion(randomChr, randomPos, randomChr, randomEnd)){
cerr << "FATAL: Cannot set random region";
exit(1);
}
if(!bamR.GetNextAlignmentCore(al)){
continue;
}
i++;
long int cp = al.GetEndPosition(false,true);
readPileUp allPileUp;
while(bamR.GetNextAlignment(al)){
if(!al.IsMapped() || ! al.IsProperPair()){
continue;
}
string any;
if(al.GetTag("XA", any)){
continue;
}
if(al.GetTag( globalOpts.saT, any)){
continue;
}
if(al.IsDuplicate()){
continue;
}
string squals = al.Qualities;
// summing base qualities (quals is the lookup)
for(unsigned int q = 0 ; q < squals.size(); q++){
qsum += quals[ int(squals[q]) ];
qnum += 1;
if(quals[int(squals[q])] < 0){
omp_set_lock(&lock);
cerr << endl;
cerr << "FATAL: base quality is not sanger or illumina 1.8+ (0,41) in file : " << targetfile << endl;
cerr << "INFO : offending qual string : " << squals << endl;
cerr << "INFO : offending qual char : " << squals[q] << endl;
cerr << "INFO : -1 qual ; qual ; qual +1: " << quals[ int(squals[q]) -1 ] << " " << quals[ int(squals[q]) ] << " " << quals[ int(squals[q]) +1 ] << endl;
cerr << "INFO : rescale qualities or contact author for additional quality ranges" << endl;
cerr << endl;
omp_unset_lock(&lock);
exit(1);
}
}
if(al.Position > cp){
allPileUp.purgePast(&cp);
cp = al.GetEndPosition(false,true);
nReads.push_back(allPileUp.currentData.size());
}
if(al.IsMapped()
&& al.IsMateMapped()
&& abs(double(al.InsertSize)) < 10000
&& al.RefID == al.MateRefID
){
allPileUp.processAlignment(al);
alIns.push_back(abs(double(al.InsertSize)));
mapQuality.push_back(al.MapQuality);
n++;
}
}
}
bamR.Close();
sort(alIns.begin(), alIns.end() );
int index = 0;
if((alIns.size() % 2) != 0 ){
index = ((alIns.size()+1)/2)-1;
}
else{
index = (alIns.size()/2)-1;
}
double median = alIns[index];
double mu = mean(alIns );
double mud = mean(nReads );
double variance = var(alIns, mu );
double sd = sqrt(variance );
double sdd = sqrt(var(nReads, mud ));
double muQ = mean(mapQuality);
omp_set_lock(&lock);
insertDists.mus[ targetfile ] = mu;
insertDists.sds[ targetfile ] = sd;
insertDists.avgD[ targetfile ] = mud;
insertDists.low[ targetfile ] = insertDists.mus[targetfile]
- (1.6*insertDists.sds[targetfile]);
if(insertDists.low[ targetfile ] < 0 ){
insertDists.low[ targetfile ] = 0;
}
insertDists.upr[ targetfile ] = insertDists.mus[targetfile]
+ (2.5*insertDists.sds[targetfile]);
stringstream whereTo;
whereTo << "INFO: for Sample:" << SM << endl
<< " STATS: " << SM << ": mean depth ..............: " << mud
<< std::endl
<< " STATS: " << SM << ": sd depth ................: " << sdd
<< std::endl
<< " STATS: " << SM << ": mean insert length: .....: " << insertDists.mus[targetfile]
<< std::endl
<< " STATS: " << SM << ": median insert length ....: " << median
<< std::endl
<< " STATS: " << SM << ": sd insert length ........: " << insertDists.sds[targetfile]
<< std::endl
<< " STATS: " << SM << ": lower insert cutoff .....: " << insertDists.low[targetfile]
<< std::endl
<< " STATS: " << SM << ": upper insert cutoff .....: " << insertDists.upr[targetfile]
<< std::endl
<< " STATS: " << SM << ": average base quality ....: " << double(qsum)/double(qnum)
<< std::endl
<< " STATS: " << SM << ": average mapping quality .: " << muQ
<< std::endl
<< " STATS: " << SM << ": number of reads used ....: " << n
<< std::endl << std::endl;
if(globalOpts.statsOnly){
cout << whereTo.str();
}
else{
cerr << whereTo.str();
}
omp_unset_lock(&lock);
}
int avgP(node * n){
double pi = 0;
double ni = 0;
for(vector<edge *>::iterator it = n->eds.begin(); it != n->eds.end(); it++){
if((*it)->L->pos == n->pos){
pi += double((*it)->R->pos) * double((*it)->support['H']);
ni += double((*it)->support['H']) ;
}
else{
pi += double((*it)->L->pos) * double((*it)->support['H']);
ni += double((*it)->support['H']) ;
}
}
return int(double(pi) / double(ni));
}
void allStats(void){
#pragma omp parallel for schedule(dynamic, 1)
for(unsigned int i = 0; i < globalOpts.targetBams.size(); i++){
gatherBamStats(globalOpts.targetBams[i]);
}
if(globalOpts.statsOnly){
cerr << "INFO: Exiting as -s flag is set." << endl;
cerr << "INFO: WHAM finished normally, goodbye! " << endl;
exit(0);
}
}
void loadReads(std::vector<RefData> & sequences){
// load bam has openMP inside for running regions quickly
if(globalOpts.svs.empty()){
cerr << "INFO: Loading discordant reads into forest." << endl;
for(vector<string>::iterator bam = globalOpts.targetBams.begin();
bam != globalOpts.targetBams.end(); bam++){
cerr << "INFO: Reading: " << *bam << endl;
loadBam(*bam);
for(map<int, map<int, node * > >::iterator seqid
= globalGraph.nodes.begin();
seqid != globalGraph.nodes.end(); seqid++){
cerr << "INFO: Number of putative breakpoints for: "
<< sequences[seqid->first].RefName << ": "
<< globalGraph.nodes[seqid->first].size() << endl;
}
}
cerr << "INFO: Finished loading reads." << endl;
}
}
//------------------------------- MAIN --------------------------------
/*
Comments:
*/
int main( int argc, char** argv)
{
globalOpts.nthreads = 1 ;
globalOpts.lastSeqid = 0 ;
globalOpts.MQ = 20 ;
globalOpts.NM = 10 ;
globalOpts.minPairMatch = 100 ;
globalOpts.saT = "SA" ;
globalOpts.keepTrying = false;
globalOpts.statsOnly = false;
globalOpts.skipGeno = false;
globalOpts.noInterSeqid = true ;
int parse = parseOpts(argc, argv);
if(parse != 1){
cerr << "FATAL: unable to parse command line correctly." << endl;
printHelp();
exit(1);
}
omp_set_num_threads(globalOpts.nthreads);
if(globalOpts.fasta.empty()){
cerr << "FATAL: no reference fasta provided." << endl << endl;
printHelp();
exit(1);
}
// gather stats for each bam (depth, insert l, ...)
allStats();
RefVector sequences;
BamMultiReader mr;
if(! mr.Open(globalOpts.targetBams)){
cerr << "FATAL: issue opening all bams to extract header" << endl;
exit(1);
}
else{
sequences = mr.GetReferenceData();
mr.Close();
}
int s = 0;
for(vector<RefData>::iterator it = sequences.begin();
it != sequences.end(); it++){
inverse_lookup[(*it).RefName] = s;
forward_lookup[s] = (*it).RefName;
s+= 1;
}
loadReads(sequences);
vector<breakpoint*> allBreakpoints;
vector<vector<node*> > globalTrees;
map<int, map<int, breakpoint*> > breakpointLookup;
if(globalOpts.svs.empty()){
cerr << "INFO: Gathering graphs from forest." << endl;
gatherTrees(globalTrees);
for(unsigned int i = 0 ; i < globalTrees.size(); i++){
breakpoint * tmp = new breakpoint;
allBreakpoints.push_back(tmp);
}
cerr << "INFO: Matching breakpoints." << endl;
#pragma omp parallel for schedule(dynamic, 3)
for(unsigned int i = 0 ; i < globalTrees.size(); i++){
if((i % 10000) == 0){
omp_set_lock(&lock);
cerr << "INFO: Processed " << i
<< "/" << globalTrees.size() << " graphs" << endl;
omp_unset_lock(&lock);
}
findPairs(globalTrees[i], allBreakpoints[i], breakpointLookup);
if(allBreakpoints[i]->IsMasked() ){
continue;
}
allBreakpoints[i]->countSupportType();
allBreakpoints[i]->calcType();
allBreakpoints[i]->delClusterCheck();
allBreakpoints[i]->invClusterCheck();
allBreakpoints[i]->dupClusterCheck();
allBreakpoints[i]->getRefBases(globalOpts.fasta, forward_lookup);
}
cerr << "INFO: Printing." << endl;
unpairedSVs unMatched;
doubleCheckDel(allBreakpoints, unMatched );
doubleCheckDup(allBreakpoints, unMatched );
doubleCheckInv(allBreakpoints, unMatched );
doubleCheckIns(allBreakpoints, unMatched );
// std::cerr << "INFO: clustering unlinked deletions." << std::endl;
// clusterUnlinked(unMatched, 'D');
printVCF(allBreakpoints);
std::cerr << "INFO: done processing trees" << std::endl;
if(!globalOpts.graphOut.empty()){
dump(globalTrees);
}
}
if(!globalOpts.graphOut.empty()){
dump(globalTrees);
}
for(vector<breakpoint*>::iterator bks = allBreakpoints.begin();
bks != allBreakpoints.end(); bks++){
delete (*bks);
}
cerr << "INFO: WHAM finished normally, goodbye! " << endl;
return 0;
}