src/bin/wham.cpp
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cmath>
#include <time.h>
#include <algorithm>
#include "split.h"
#include "KMERUTILS.h"
#include "fastahack/Fasta.h"
#include "ssw_cpp.h"
// openMP - swing that hammer
#include <omp.h>
// bamtools and my headers
#include "api/BamMultiReader.h"
#include "readPileUp.h"
// msa headers
#include <seqan/align.h>
#include <seqan/graph_msa.h>
using namespace std;
using namespace BamTools;
struct cigar{
char type;
unsigned int length;
};
struct regionDat{
int seqidIndex ;
int start ;
int end ;
};
struct indvDat{
bool support ;
string genotype ;
int genotypeIndex ;
int nReads ;
int mappedPairs ;
int nAboveAvg ;
int notMapped ;
int mateMissing ;
int sameStrand ;
int nClipping ;
int mateCrossChromosome ;
int maxLength ;
double nBad ;
double nGood ;
double insertSum ;
double insertMean ;
double lengthSum ;
double clipped ;
vector<double> inserts;
vector<double> hInserts;
vector<long double> gls;
vector< BamAlignment > alignments;
vector<int> badFlag;
vector<int> MapQ;
map<int, vector<string> > cluster;
};
struct insertDat{
map<string, double> mus; // mean of insert length for each indvdual across 1e6 reads
map<string, double> sds; // standard deviation
map<string, double> lq ; // 25% of data
map<string, double> up ; // 75% of the data
map<string, double> avgD;
double overallDepth;
} insertDists;
struct global_opts {
vector<string> targetBams ;
vector<string> backgroundBams ;
vector<string> all ;
int nthreads ;
string fasta ;
string seqid ;
string bed ;
string mask ;
int qualLookup[126] ;
int qualCut ; // average base pair quality for 5nt left or right of the breakpoint
int MQ ; // required mapping quality for soft-clipped reads
unsigned int min ; // number of soft-clips with the same start or end
vector<int> region ;
} globalOpts;
struct info_field{
double lrt;
double taf;
double baf;
double aaf;
double nat;
double nbt;
double nab;
double nbb;
double tgc;
double bgc;
};
template<typename T>
inline bool aminan(T value)
{
return value != value;
}
static const char *optString ="ht:f:b:r:x:e:m:q:p:i";
// this lookup is good for sanger and illumina 1.8+
int SangerLookup[126] = {-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 0-9 1-10
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 10-19 11-20
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 20-29 21-30
-1,-1, 0, 1, 2, 3, 4, 5, 6, 7, // 30-39 31-40
8, 9,10,11,12, 13,14,15,16,17, // 40-49 41-50
18,19,20,21,22, 23,24,25,26,27, // 50-59 51-60
28,29,30,31,32, 33,34,35,36,37, // 60-69 61-70
38,39,40,41,42, 43,44,45,46,47, // 70-79 71-80
48,49,50,51,52, 53,54,55,56,57, // 80-89 81-90
58,59,60,61,62, 63,64,65,66,67, // 90-99 91-100
68,69,70,71,72, 73,74,75,76,77, // 100-109 101-110
78,79,80,81,82, 83,84,85,86,87, // 110-119 111-120
88,89,90,91,92, 93 }; // 120-119 121-130
int IlluminaOneThree[126] = {-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 0-9 1-10
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 10-19 11-20
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 20-29 21-30
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 30-39 31-40
-1,-1,-1,-1,-1 -1,-1,-1,-1,-1, // 40-49 41-50
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 50-59 51-60
-1,-1,-1,-1,-1, 0, 1, 2, 3, 4, // 60-69 61-70
5, 6, 7, 8, 9, 10,11,12,13,14, // 70-79 71-80
15,16,17,18,19, 20,21,22,23,24, // 80-89 81-90
25,26,27,28,29, 30,31,32,33,34, // 90-99 91-100
35,36,37,38,39, 40,-1,-1,-1,-1, // 100-109 101-110
-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, // 110-119 111-120
-1,-1,-1,-1,-1, -1,-1 }; // 120-119 121-130
// this lock prevents threads from printing on top of each other
omp_lock_t lock;
bool sortStringSize(string i, string j) {return (i.size() < j.size());}
//ripped from EKG @ github Shannon Entropy
double entropy(string& st) {
vector<char> stvec(st.begin(), st.end());
set<char> alphabet(stvec.begin(), stvec.end());
vector<double> freqs;
for (set<char>::iterator c = alphabet.begin(); c != alphabet.end(); ++c) {
int ctr = 0;
for (vector<char>::iterator s = stvec.begin(); s != stvec.end(); ++s) {
if (*s == *c) {
++ctr;
}
}
freqs.push_back((double)ctr / (double)stvec.size());
}
double ent = 0;
double ln2 = log(2);
for (vector<double>::iterator f = freqs.begin(); f != freqs.end(); ++f) {
ent += *f * log(*f)/ln2;
}
ent = -ent;
return ent;
}
// rounding ints
int RoundNum(int num)
{
int rem = num % 10;
return rem >= 5 ? (num - rem + 10) : (num - rem);
}
int roundUp(int numToRound, int multiple)
{
if(multiple == 0)
{
return numToRound;
}
int remainder = numToRound % multiple;
if (remainder == 0)
return numToRound;
return numToRound + multiple - remainder;
}
void initInfo(info_field * s){
s->lrt = 0;
s->taf = 0.000001;
s->baf = 0.000001;
s->aaf = 0.000001;
s->nat = 0;
s->nbt = 0;
s->nab = 0;
s->nbb = 0;
s->tgc = 0;
s->bgc = 0;
}
void initIndv(indvDat * s){
s->support = false;
s->genotype = "./.";
s->genotypeIndex = -1;
s->nBad = 0;
s->nClipping = 0;
s->nGood = 0;
s->nReads = 0;
s->nAboveAvg = 0;
s->notMapped = 0;
s->mappedPairs = 0;
s->mateMissing = 0;
s->insertSum = 0;
s->sameStrand = 0;
s->maxLength = 0;
s->mateCrossChromosome = 0;
s->lengthSum = 0;
s->clipped = 0;
s->alignments.clear();
s->inserts.clear();
s->hInserts.clear();
s->badFlag.clear();
s->MapQ.clear();
}
string collapseCigar(vector<CigarOp> & v){
stringstream ss;
for(vector<CigarOp>::iterator it = v.begin(); it != v.end(); it++){
ss << (*it).Length;
ss << (*it).Type;
}
return ss.str();
}
void printHeader(void){
cout << "##fileformat=VCFv4.1" << endl;
cout << "##INFO=<ID=LRT,Number=1,Type=Float,Description=\"Likelihood Ratio Test statistic\">" << endl;
cout << "##INFO=<ID=WAF,Number=3,Type=Float,Description=\"Allele frequency of: background,target,combined\">" << endl;
cout << "##INFO=<ID=GC,Number=2,Type=Integer,Description=\"Number of called genotypes in: background,target\">" << endl;
cout << "##INFO=<ID=AT,Number=15,Type=Float,Description=\"Pileup attributes\">" << endl;
cout << "##INFO=<ID=CF,Number=1,Type=Float,Description=\"Fraction of reads with more than three cigar operations\">" << endl;
cout << "##INFO=<ID=CISTART,Number=2,Type=Integer,Description=\"PE confidence interval around POS\">" << endl;
cout << "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"PE confidence interval around END\">" << endl;
cout << "##INFO=<ID=PU,Number=1,Type=Integer,Description=\"Number of reads supporting position\">" << endl;
cout << "##INFO=<ID=SU,Number=1,Type=Integer,Description=\"Number of supplemental reads supporting position\">" << endl;
cout << "##INFO=<ID=CU,Number=1,Type=Integer,Description=\"Number of neighboring all soft clip clusters across all individuals at pileup position\">" << endl;
cout << "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Number of reads at pileup position across individuals passing filters\">" << endl;
cout << "##INFO=<ID=NC,Number=1,Type=Float,Description=\"Number of soft clipped sequences collapsed into consensus\">" << endl;
cout << "##INFO=<ID=MQ,Number=1,Type=Float,Description=\"Average mapping quality\">" << endl;
cout << "##INFO=<ID=MQF,Number=1,Type=Float,Description=\"Fraction of reads with MQ less than 50\">" << endl;
cout << "##INFO=<ID=SP,Number=3,Type=Integer,Description=\"Number of reads supporting endpoint: mate-position,split-read,alternative-mapping\">" << endl;
cout << "##INFO=<ID=CHR2,Number=3,Type=String,Description=\"Other seqid\">" << endl;
cout << "##INFO=<ID=DI,Number=1,Type=Character,Description=\"Consensus is from front or back of pileup: f,b\">" << endl;
cout << "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">" << endl;
cout << "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between POS and END\">" << endl;
cout << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
cout << "##FORMAT=<ID=GL,Number=A,Type=Float,Description=\"Genotype Likelihood\">" << endl;
cout << "##FORMAT=<ID=NR,Number=1,Type=Integer,Description=\"Number of reads that do not support a SV\">" << endl;
cout << "##FORMAT=<ID=NA,Number=1,Type=Integer,Description=\"Number of reads supporting a SV\">" << endl;
cout << "##FORMAT=<ID=NS,Number=1,Type=Integer,Description=\"Number of reads with a softclip at POS for individual\">" << endl;
cout << "##FORMAT=<ID=RD,Number=1,Type=Integer,Description=\"Number of reads passing filters\">" << endl;
cout << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT" << "\t";
for(unsigned int b = 0; b < globalOpts.all.size(); b++){
cout << globalOpts.all[b] ;
if(b < globalOpts.all.size() - 1){
cout << "\t";
}
}
cout << endl;
}
string joinComma(vector<string> & strings){
stringstream ss;
for(unsigned int i = 0; i < strings.size() -1 ; i++){
ss << strings[i] << "," ;
}
ss << strings.back();
return ss.str();
}
string printIndvDat( indvDat * d ){
stringstream ss;
ss << "Genotype : .......... " << d->genotype << endl;
ss << "Genotype index : .......... " << d->genotypeIndex << endl;
ss << "Number of reads: .......... " << d->badFlag.size() << endl;
ss << "Number of mapped mates: ... " << d->mappedPairs << endl;
ss << "Number of odd insert size: " << d->nAboveAvg << endl;
ss << "Number of reads not mapped: " << d->notMapped << endl;
ss << "Number of mates missing: .. " << d->mateMissing << endl;
ss << "Number of mates same strand:" << d->sameStrand << endl;
ss << "Number of reads clipped: .. " << d->nClipping << endl;
ss << "Number of alternative reads:" << d->nBad << endl;
ss << "Number of reference reads: " << d->nGood << endl;
ss << "Genotype likelihoods: ..... " << d->gls[0]
<< ":" << d->gls[1]
<< ":" << d->gls[2]
<< endl;
ss << endl;
ss << "Reads: " << endl;
int z = 0;
for(vector< BamAlignment >::iterator it = d->alignments.begin(); it != d->alignments.end(); it++){
ss << " " << (*it).Name << " "
<< d->badFlag[z] << " "
<< (*it).RefID << " "
<< (*it).Position << " "
<< (*it).GetEndPosition(false,true) << " "
<< (*it).Position + (*it).Length << " "
<< (*it).MapQuality << " "
<< (*it).MateRefID << " "
<< (*it).MatePosition << " "
<< (*it).InsertSize << " "
<< collapseCigar((*it).CigarData) << " "
<< (*it).AlignmentFlag << " "
<< (*it).QueryBases
<< endl;
z++;
}
return ss.str();
}
string join(vector<string> strings){
string joined = "";
for(vector<string>::iterator sit = strings.begin(); sit != strings.end(); sit++){
joined = joined + " " + (*sit) + "\n";
}
return joined;
}
void printVersion(void){
cerr << "Version: " << VERSION << endl;
cerr << "Contact: zev.kronenberg [at] gmail.com " << endl;
cerr << "Notes : -If you find a bug, please open a report on github!" << endl;
cerr << " -The options -m,-q, and -p, control the sensitivity and specificity" << endl;
cerr << " -If you have exome data use the -e option for best performance " << endl;
cerr << endl;
}
void printHelp(void){
cerr << "usage : WHAM-BAM -f <STRING> -m <INT> -q <INT> -p <INT> -x <INT> -r <STRING> -e <STRING> -t <STRING> -b <STRING> " << endl << endl;
cerr << "example: WHAM-BAM if my.fasta -m 2 -q 15 -p 10 -x 20 -r chr1:0-10000 -e genes.bed -t a.bam,b.bam -b c.bam,d.bam" << endl << endl;
cerr << "required : t <STRING> -- comma separated list of target bam files " << endl ;
cerr << "required : f <STRING> -- reference sequence reads were aligned to " << endl ;
cerr << "option : b <STRING> -- comma separated list of background bam files " << endl ;
cerr << "option : r <STRING> -- a genomic region in the format \"seqid:start-end\"" << endl ;
cerr << "option : x <INT> -- set the number of threads, otherwise max [all] " << endl ;
cerr << "option : e <STRING> -- a bedfile that defines regions to score [none] " << endl ;
cerr << "option : m <INT> -- minimum number of soft-clips supporting " << endl ;
cerr << " START [3] " << endl ;
cerr << "option : q <INT> -- exclude soft-cliped sequences with average base " << endl ;
cerr << " quality below phred scaled value (0-41) [20] " << endl ;
cerr << "option : p <INT> -- exclude soft-clipped reads with mapping quality " << endl ;
cerr << " below value [15] " << endl ;
cerr << "option : i -- base quality is Illumina 1.3+ Phred+64 " << endl ;
cerr << endl;
printVersion();
}
void parseOpts(int argc, char** argv){
int opt = 0;
globalOpts.mask = "NA";
globalOpts.bed = "NA";
globalOpts.qualCut = 20 ;
globalOpts.MQ = 15 ;
globalOpts.min = 3 ;
opt = getopt(argc, argv, optString);
while(opt != -1){
switch(opt){
case 'i':
{
cerr << "INFO: base quality is Illumina 1.3+ Phred+64 NOT sanger Phred+33" << endl;
for(unsigned int z = 0; z < 126; z++){
cerr << z << " " << SangerLookup[z] << " " << IlluminaOneThree[z] << endl;
SangerLookup[z] = IlluminaOneThree[z];
}
break;
}
case 'p':
{
globalOpts.MQ = atoi(((string)optarg).c_str());
cerr << "INFO: WHAM-BAM skip soft-clips with mapping quaity below: " << globalOpts.MQ << endl;
break;
}
case 'q':
{
globalOpts.qualCut = atoi(((string)optarg).c_str());
cerr << "INFO: WHAM-BAM skip soft-clips with average base quality below: " << globalOpts.qualCut << endl;
break;
}
case 'f':
{
globalOpts.fasta = optarg;
cerr << "INFO: WHAM-BAM will using the following fasta: " << globalOpts.fasta << endl;
break;
}
case 'm':
{
globalOpts.min = atoi(((string)optarg).c_str());
cerr << "INFO: WHAM-BAM requires : " << globalOpts.min << " soft-clips to score breakpoint " << endl;
break;
}
case 'e':
{
globalOpts.bed = optarg;
cerr << "INFO: WHAM-BAM will only score within bed coordiates provided: " << globalOpts.bed << endl;
break;
}
case 'x':
{
globalOpts.nthreads = atoi(((string)optarg).c_str());
cerr << "INFO: OpenMP will roughly use " << globalOpts.nthreads << " threads" << endl;
break;
}
case 't':
{
globalOpts.targetBams = split(optarg, ",");
cerr << "INFO: target bams:\n" << join(globalOpts.targetBams) ;
break;
}
case 'b':
{
globalOpts.backgroundBams = split(optarg, ",");
cerr << "INFO: background bams:\n" << join(globalOpts.backgroundBams) ;
break;
}
case 'h':
{
printHelp();
exit(1);
}
case 'r':
{
vector<string> tmp_region = split(optarg, ":");
if(tmp_region.size() != 2 || tmp_region[1].empty() || tmp_region[0].empty()){
cerr << "FATAL: region was not set correctly" << endl;
cerr << "INFO: region format: seqid:start-end" << endl;
exit(1);
}
vector<string> start_end = split(tmp_region[1], "-");
globalOpts.seqid = tmp_region[0];
globalOpts.region.push_back(atoi(start_end[0].c_str()));
globalOpts.region.push_back(atoi(start_end[1].c_str()));
if(start_end.size() !=2 || start_end[0].empty() || start_end[1].empty()){
cerr << "FATAL: region was not set correctly" << endl;
cerr << "INFO: region format: seqid:start-end" << endl;
exit(1);
}
cerr << "INFO: region set to: " << globalOpts.seqid << ":" << globalOpts.region[0] << "-" << globalOpts.region[1] << endl;
if(globalOpts.region.size() != 2){
cerr << "FATAL: incorrectly formatted region." << endl;
cerr << "FATAL: wham is now exiting." << endl;
exit(1);
}
break;
}
case '?':
{
printHelp();
exit(1);
}
default:
{
cerr << "FATAL: Failure to parse command line options." << endl;
cerr << "FATAL: Now exiting wham." << endl;
printHelp();
exit(1);
}
}
opt = getopt( argc, argv, optString );
}
if( globalOpts.targetBams.empty() && globalOpts.backgroundBams.empty() ){
cerr << "FATAL: Failure to specify target and/or background bam files." << endl;
cerr << "FATAL: Now exiting wham." << endl;
printHelp();
exit(1);
}
if(globalOpts.fasta.empty()){
cerr << "FATAL: Failure to specify fasta file." << endl;
cerr << "FATAL: Now exiting wham." << endl;
printHelp();
exit(1);
}
}
double bound(double v){
if(v <= 0.00001){
return 0.00001;
}
if(v >= 0.99999){
return 0.99999;
}
return v;
}
double mean(vector<int> & data){
double sum = 0;
for(vector<int>::iterator it = data.begin(); it != data.end(); it++){
sum += (*it);
}
return sum / data.size();
}
double mean(vector<double> & data){
double sum = 0;
for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
sum += (*it);
}
return sum / data.size();
}
double var(vector<double> & data, double mu){
double variance = 0;
for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
variance += pow((*it) - mu,2);
}
return variance / (data.size() - 1);
}
// gerates per bamfile statistics
void grabInsertLengths(string & targetfile){
omp_set_lock(&lock);
int quals [126];
memcpy(quals, SangerLookup, 126*sizeof(int));
omp_unset_lock(&lock);
vector<double> alIns;
vector<double> nReads;
BamReader bamR;
if(!bamR.Open(targetfile) ){
cerr << "FATAL: cannot find - or - read : " << targetfile << endl;
exit(1);
}
if(! bamR.LocateIndex()){
cerr << "FATAL: cannot find - or - open index for : " << targetfile << endl;
exit(1);
}
SamHeader SH = bamR.GetHeader();
if(!SH.HasSortOrder()){
cerr << "FATAL: sorted bams must have the @HD SO: tag in each SAM header." << endl;
exit(1);
}
RefVector sequences = bamR.GetReferenceData();
int i = 0; // index for while loop
int n = 0; // number of reads
BamAlignment al;
int qsum = 0;
int qnum = 0;
int fail = 0;
while(i < 5 || n < 100000){
fail += 1;
if(fail > 1000000){
cerr << "FATAL: was not able to gather stats on bamfile: " << targetfile << endl;
exit(1);
}
unsigned int max = 20;
if(sequences.size() < max){
max = sequences.size() ;
}
int randomChr = rand() % (max -1);
int randomPos = rand() % (sequences[randomChr].RefLength -1);
int randomEnd = randomPos + 2000;
if(randomEnd > sequences[randomChr].RefLength){
continue;
}
if(! bamR.SetRegion(randomChr, randomPos, randomChr, randomEnd)){
cerr << "FATAL: Cannot set random region";
exit(1);
}
if(!bamR.GetNextAlignmentCore(al)){
continue;
}
i++;
long int cp = al.GetEndPosition(false,true);
readPileUp allPileUp;
while(bamR.GetNextAlignment(al)){
if(!al.IsMapped() || ! al.IsProperPair()){
continue;
}
string any;
if(al.GetTag("XA", any)){
continue;
}
if(al.GetTag("SA", any)){
continue;
}
if(al.IsDuplicate()){
continue;
}
string squals = al.Qualities;
// summing base qualities (quals is the lookup)
for(unsigned int q = 0 ; q < squals.size(); q++){
qsum += quals[ int(squals[q]) ];
qnum += 1;
if(quals[int(squals[q])] < 0){
omp_set_lock(&lock);
cerr << endl;
cerr << "FATAL: base quality is not sanger or illumina 1.8+ (0,41) in file : " << targetfile << endl;
cerr << "INFO : offending qual string : " << squals << endl;
cerr << "INFO : offending qual char : " << squals[q] << endl;
cerr << "INFO : -1 qual ; qual ; qual +1: " << quals[ int(squals[q]) -1 ] << " " << quals[ int(squals[q]) ] << " " << quals[ int(squals[q]) +1 ] << endl;
cerr << "INFO : rescale qualities or contact author for additional quality ranges" << endl;
cerr << endl;
omp_unset_lock(&lock);
exit(1);
}
}
if(al.Position > cp){
allPileUp.purgePast(&cp);
cp = al.GetEndPosition(false,true);
nReads.push_back(allPileUp.currentData.size());
}
if(al.IsMapped()
&& al.IsMateMapped()
&& abs(double(al.InsertSize)) < 10000
&& al.RefID == al.MateRefID
){
allPileUp.processAlignment(al);
alIns.push_back(abs(double(al.InsertSize)));
n++;
}
}
}
bamR.Close();
double mu = mean(alIns );
double mud = mean(nReads );
double variance = var(alIns, mu );
double sd = sqrt(variance );
double sdd = sqrt(var(nReads, mud ));
omp_set_lock(&lock);
insertDists.mus[ targetfile ] = mu;
insertDists.sds[ targetfile ] = sd;
insertDists.avgD[ targetfile ] = mud;
cerr << "INFO: for file:" << targetfile << endl
<< " " << targetfile << ": mean depth: ......... " << mud << endl
<< " " << targetfile << ": sd depth: ......... " << sdd << endl
<< " " << targetfile << ": mean insert length: . " << insertDists.mus[targetfile] << endl
<< " " << targetfile << ": sd insert length: . " << insertDists.sds[targetfile] << endl
<< " " << targetfile << ": lower insert length: " << insertDists.mus[targetfile] -(2.5*insertDists.sds[targetfile]) << endl
<< " " << targetfile << ": upper insert length: " << insertDists.mus[targetfile] +(2.5*insertDists.sds[targetfile]) << endl
<< " " << targetfile << ": average base quality: " << double(qsum)/double(qnum) << " " << qsum << " " << qnum << endl
<< " " << targetfile << ": number of reads used: " << n << endl << endl;
omp_unset_lock(&lock);
}
void prepBams(BamMultiReader & bamMreader, string group){
string errorMessage ;
vector<string> files;
if(group == "target"){
files = globalOpts.targetBams;
}
if(group == "background"){
files = globalOpts.backgroundBams;
}
if(group == "all"){
files = globalOpts.all;
}
if(files.empty()){
cerr << "FATAL: no files ?" << endl;
exit(1);
}
bool attempt = true;
int tried = 0 ;
while(attempt && tried < 500){
// sleep(int(rand() % 10)+1);
if( bamMreader.Open(files) && bamMreader.LocateIndexes() ){
attempt = false;
}
else{
tried += 1;
}
}
if(attempt == true){
cerr << "FATAL: unable to open BAMs or indices after: " << tried << " attempts" << endl;
cerr << "Bamtools error message:\n" << bamMreader.GetErrorString() << endl;
cerr << "INFO : check bam names " << endl;
cerr << "INFO : check that the indicies are *.bam.bai " << endl;
cerr << "INFO : check that the ulimit is not being exceeded" << endl;
exit(1);
}
}
double logLbinomial(double x, double n, double p){
double ans = lgamma(n+1)-lgamma(x+1)-lgamma(n-x+1) + x * log(p) + (n-x) * log(1-p);
return ans;
}
double logDbeta(double alpha, double beta, double x){
double ans = 0;
ans += lgamma(alpha + beta) - ( lgamma(alpha) + lgamma(beta) );
ans += log( pow(x, (alpha -1) ) ) + log( pow( (1-x) , (beta-1)));
cerr << "alpha: " << alpha << "\t" << "beta: " << beta << "\t" << "x: " << x << "\t" << ans;
cerr << endl;
return ans;
}
double totalLL(vector<double> & data, double alpha, double beta){
double total = 0;
for(vector<double>::iterator it = data.begin(); it != data.end(); it++){
total += logDbeta(alpha, beta, (*it));
}
return total;
}
double unphred(double p){
return pow(10, (-p/10));
}
bool processGenotype(string & fname ,
indvDat * idat ,
double * totalAlt ,
double * totalAltGeno ,
double * relativeDepth,
insertDat * stats ){
string genotype = "./.";
long double aal = 0;
long double abl = 0;
long double bbl = 0;
if(idat->badFlag.size() < 2){
idat->gls.push_back(-255.0);
idat->gls.push_back(-255.0);
idat->gls.push_back(-255.0);
return true;
}
double nref = 0.0;
double nalt = 0.0;
if(idat->badFlag.size() > 1000){
std::random_shuffle ( idat->badFlag.begin(), idat->badFlag.end() );
}
int ri = 0;
double within_sample_clip_frq = double( idat->nClipping ) / double( idat->badFlag.size() ) ;
for(vector< int >::iterator rit = idat->badFlag.begin(); rit != idat->badFlag.end(); rit++){
if(ri > 1000){
break;
}
if(idat->MapQ[ri] == 0){
idat->MapQ[ri] = 1;
}
double mappingP = unphred(idat->MapQ[ri]);
#ifdef DEBUG
cerr << "Geno: clipping must reach 0.01 frq: " << within_sample_clip_frq << endl;
#endif
if( (*rit == 1 && ( within_sample_clip_frq > 0.01 )) && idat->nClipping > 1){
#ifdef DEBUG
cerr << "geno mq unphred: " << mappingP << " " << idat->MapQ[ri] << endl;
cerr << "geno aal abl bbl: " << aal << " " << abl << " " << bbl << endl;
#endif
nalt += 1;
aal += log((2-2) * (1-mappingP) + (2*mappingP)) ;
abl += log((2-1) * (1-mappingP) + (1*mappingP)) ;
bbl += log((2-0) * (1-mappingP) + (0*mappingP)) ;
}
else{
nref += 1;
aal += log((2 - 2)*mappingP + (2*(1-mappingP)));
abl += log((2 - 1)*mappingP + (1*(1-mappingP)));
bbl += log((2 - 0)*mappingP + (0*(1-mappingP)));
}
ri++;
}
idat->nBad = nalt;
idat->nGood = nref;
double nreads = idat->nReads;
if(nreads > 1000){
nreads = 1000;
}
aal = aal - log(pow(2,nreads)); // the normalization of the genotype likelihood
abl = abl - log(pow(2,nreads)); // this is causing underflow for really high depth.
bbl = bbl - log(pow(2,nreads));
#ifdef DEBUG
cerr << "Geno: genotype likelihoods before corrections: 0/0 , 0/1, 1/1 " << aal << " " << abl << " " << bbl << endl;
#endif
if(nref == 0){
aal = -255.0;
abl = -255.0;
}
if(nalt == 0){
abl = -255.0;
bbl = -255.0;
}
double max = aal;
genotype = "0/0";
idat->genotypeIndex = 0;
if(abl > max){
genotype = "0/1";
idat->genotypeIndex = 1;
(*totalAlt) += 1;
max = abl;
}
if(bbl > max){
genotype = "1/1";
idat->genotypeIndex = 2;
(*totalAlt) += 2;
}
if(idat->genotypeIndex != 0){
*relativeDepth += (idat->badFlag.size() / stats->avgD[fname]);
*totalAltGeno += 1;
}
idat->genotype = genotype;
idat->gls.push_back(aal);
idat->gls.push_back(abl);
idat->gls.push_back(bbl);
#ifdef DEBUG
cerr << genotype << "\t" << aal << "\t" << abl << "\t" << bbl << "\t" << nref << "\t" << endl;
#endif
return true;
}
bool checkN(string & s){
int sl = s.size();
int n = 0;
int i = 0;
if(sl < 20){
return true;
}
for(i = 0 ; i < 10; i++){
if(s[i] == 'N'){
n += 1;
}
}
for(i = sl - 1; i < sl - 10; i--){
if(s[i] == 'N'){
n += 1;
}
}
if(n > 5){
return true;
}
return false;
}
bool loadReadsIntoIndvs(map<string, indvDat*> & ti ,
readPileUp & pileup ,
global_opts & localOpts ,
insertDat & localDists ,
long int * pos ,
vector<int> & insertLength ,
vector<int> & outOfBounds ){
for(list<BamAlignment>::iterator r = pileup.currentData.begin(); r != pileup.currentData.end(); r++){
if((*r).Position > *pos){
continue;
}
if( ((*r).AlignmentFlag & 0x0800) != 0 || ! (*r).IsPrimaryAlignment()){
continue;
}
string fname = (*r).Filename;
int bad = 0;
vector< CigarOp > cd = (*r).CigarData;
if((( abs( (*r).Position - *pos ) < 5 )
|| ( abs( (*r).GetEndPosition(false,true) - *pos) < 5 ))
&& (cd.front().Type == 'S' || cd.back().Type == 'S') ){
if( ((pileup.primary[(*r).Position].size() > 1) || (pileup.primary[(*r).GetEndPosition(false,true)].size() > 1))){
bad = 1;
ti[fname]->nClipping++;
}
}
// if( ((pileup.primary[(*r).Position].size() > 1) || (pileup.primary[(*r).GetEndPosition(false,true)].size() > 1))
// && (cd.front().Type == 'S' || cd.back().Type == 'S') ){
// bad = 1;
// ti[fname]->nClipping++;
// }
if((*r).IsMapped() && (*r).IsMateMapped() && ((*r).RefID == (*r).MateRefID)){
ti[fname]->insertSum += abs(double((*r).InsertSize));
ti[fname]->mappedPairs += 1;
if(( (*r).IsReverseStrand() && (*r).IsMateReverseStrand() ) || ( !(*r).IsReverseStrand() && !(*r).IsMateReverseStrand() )){
bad = 1;
ti[fname]->sameStrand += 1;
}
double ilength = abs ( double ( (*r).InsertSize ));
double iDiff = abs ( ilength - localDists.mus[(*r).Filename] );
int test = 0;
if(iDiff > ( 2.5 * insertDists.sds[(*r).Filename] ) ){
test = 1;
outOfBounds.push_back( (*r).MatePosition );
insertLength.push_back( (*r).InsertSize );
bad = 1;
ti[fname]->nAboveAvg += 1;
ti[fname]->hInserts.push_back(ilength);
if( ilength < localDists.mus[(*r).Filename]){
pileup.mateTooClose++;
}
if( ilength > localDists.mus[(*r).Filename] ){
pileup.mateTooFar++;
}
}
#ifdef DEBUG
cerr << "IL: " << ilength << " " << iDiff << " " << 3.0 * insertDists.sds[(*r).Filename] << " " << test << endl;
#endif
}
ti[fname]->nReads++;
map<string, int>::iterator os = pileup.odd.find((*r).Name);
if(os != pileup.odd.end()){
bad = 1;
}
if(bad == 1){
ti[fname]->nBad += 1;
}
else{
ti[fname]->nGood += 1;
}
ti[fname]->badFlag.push_back( bad );
ti[fname]->MapQ.push_back((*r).MapQuality);
#ifdef DEBUG
ti[fname]->alignments.push_back(*r);
#endif
}
return true;
}
bool cleanUp( map < string, indvDat*> & ti, global_opts localOpts){
for(vector<string>::iterator all = localOpts.all.begin(); all != localOpts.all.end(); all++ ){
delete ti[*all];
}
return true;
}
bool loadInfoField(map<string, indvDat*> dat, info_field * info, global_opts & opts){
for(unsigned int b = 0; b < opts.backgroundBams.size(); b++){
if( dat[opts.backgroundBams[b]]->genotypeIndex == -1){
continue;
}
info->nat += 2 - dat[opts.backgroundBams[b]]->genotypeIndex;
info->nbt += dat[opts.backgroundBams[b]]->genotypeIndex;
info->tgc += 1;
}
for(unsigned int t = 0; t < opts.targetBams.size(); t++){
if(dat[opts.targetBams[t]]->genotypeIndex == -1){
continue;
}
info->nab += 2 - dat[opts.targetBams[t]]->genotypeIndex;
info->nbb += dat[opts.targetBams[t]]->genotypeIndex;
info->bgc += 1;
}
info->taf += info->nbt / (info->nat + info->nbt);
info->baf += info->nbb / (info->nab + info->nbb);
info->aaf += (info->nbt + info->nbb) / (info->nat + info->nbt + info->nab + info->nbb);
double alt = logLbinomial(info->nbt, (info->tgc * 2), info->taf) + logLbinomial(info->nbb, (2* info->bgc), info->baf);
double null = logLbinomial(info->nbt, (info->tgc * 2), info->aaf) + logLbinomial(info->nbb, (2* info->bgc), info->aaf);
info->lrt = 2 * (alt - null);
if(info->lrt < 0 ){
info->lrt = 0;
}
if(aminan(info->lrt)){
info->lrt = 0;
}
return true;
}
string infoText(info_field * info){
stringstream ss;
ss << "LRT=" << info->lrt << ";";
if(aminan(info->baf)){
ss << "WAF=" << info->taf << ",.," << info->aaf << ";";
}
else if(aminan(info->taf)){
ss << "WAF=" << ".," << info->baf << "," << info->aaf << ";";
}
else{
ss << "WAF=" << info->taf << "," << info->baf << "," << info->aaf << ";";
}
ss << "GC=" << info->tgc << "," << info->bgc << ";";
return ss.str();
}
bool burnCigar(string s, vector<cigar> & cigs){
unsigned int offset = 0;
unsigned int index = 0;
for(string::iterator it = s.begin() ; it != s.end(); it++){
switch(*it){
case 'M':
{
cigar c;
c.type = 'M';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'I':
{
cigar c;
c.type = 'I';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'D':
{
cigar c;
c.type = 'D';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'N':
{
cigar c;
c.type = 'N';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'S':
{
cigar c;
c.type = 'S';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'H':
{
cigar c;
c.type = 'H';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'P':
{
cigar c;
c.type = 'P';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case 'X':
{
cigar c;
c.type = 'X';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
case '=':
{
cigar c;
c.type = '=';
c.length = atoi(s.substr(offset, index-offset).c_str());
cigs.push_back(c);
offset = index + 1;
break;
}
default:
break;
}
index += 1;
}
//cerr << "done burning" << endl;
return true;
}
void endPos(vector<cigar> & cigs, int * pos){
// MXDN= all consume reference
for(vector<cigar>::iterator it = cigs.begin();
it != cigs.end(); it++){
switch( (*it).type ){
case 'M':
{
*pos += (*it).length;
break;
}
case 'X':
{
*pos += (*it).length;
break;
}
case 'D':
{
*pos += (*it).length;
break;
}
case '=':
{
*pos += (*it).length;
break;
}
case 'N':
{
*pos += (*it).length;
break;
}
default:
break;
}
}
}
bool uniqClips(long int * pos,
map<long int, vector < BamAlignment > > & clusters,
vector<string> & alts, string & direction, global_opts & localOpts){
map<string, vector<string> > clippedSeqs;
int bcount = 0;
int fcount = 0;
for( vector < BamAlignment >::iterator it = clusters[(*pos)].begin();
it != clusters[(*pos)].end(); it++){
vector< CigarOp > cd = (*it).CigarData;
if((*it).Position == (*pos)){
string clip = (*it).QueryBases.substr(0, cd.front().Length);
if(clip.size() < 5){
continue;
}
int start = cd.front().Length - 5;
if(start < 0){
start = 0 ;
}
int sum = 0;
int num = 0;
string quals = (*it).Qualities.substr(start, 5);
#ifdef DEBUG
cerr << "clip seq : " << clip << endl;
cerr << "clip qual: " << quals << endl;
#endif
for(unsigned int q = 0 ; q < quals.size() ; q++){
sum += localOpts.qualLookup[ int(quals[q]) ] ;
num += 1;
}
#ifdef DEBUG
cerr << "clip avgQ: " << double(sum)/double(num) << endl;
#endif
if((double(sum)/double(num)) < localOpts.qualCut){
#ifdef DEBUG
cerr << "clip fail due to low qual" << endl;
#endif
continue;
}
clippedSeqs["f"].push_back(clip);
fcount += 1;
}
if((*it).GetEndPosition(false,true) == (*pos)){
string clip = (*it).QueryBases.substr( (*it).Length - cd.back().Length );
if(clip.size() < 5){
continue;
}
int start = (*it).Length - cd.back().Length;
int sum = 0;
int num = 0;
string quals = (*it).Qualities.substr(start, 5);
#ifdef DEBUG
cerr << "clip seq : " << clip << endl;
cerr << "clip qual: " << quals << endl;
#endif
for(unsigned int q = 0 ; q < quals.size() ; q++){
sum += localOpts.qualLookup[ int(quals[q]) ] ;
num += 1;
}
#ifdef DEBUG
cerr << "clip avgQ: " << double(sum)/double(num) <<endl;
#endif
if((double(sum)/double(num)) < localOpts.qualCut){
#ifdef DEBUG
cerr << "clip fail due to low qual" << endl;
#endif
continue;
}
clippedSeqs["b"].push_back(clip);
bcount += 1;
}
}
string key = "f";
if(bcount > fcount){
key = "b";
}
direction = key;
for(vector<string>::iterator seqs = clippedSeqs[key].begin();
seqs != clippedSeqs[key].end(); seqs++
){
alts.push_back(*seqs);
}
return true;
}
string consensus(vector<string> & s, double * nn, string & direction){
if(s.empty()){
return ".";
}
if(s.size() == 1){
return s[0];
}
stringstream con;
{
using namespace seqan;
typedef String< Dna > TSequence;
typedef Graph<Alignment<StringSet<TSequence, Dependent<> > > > TGraph;
StringSet<TSequence> seq;
int index = 0;
if(direction.compare("f") == 0){
for(uint clips = 0; clips < s.size(); clips++){
appendValue(seq, s[clips]);
if(clips > 19){
break;
}
}
}
else{
for(int clips = s.size() -1; clips > -1; clips--){
appendValue(seq, s[clips]);
index+=1;
if(index > 19){
break;
}
}
}
TGraph aliG(seq);
globalMsaAlignment(aliG, Blosum62(-1, -11));
String<char> align;
convertAlignment(aliG, align);
unsigned int nseq = length(seq);
unsigned int colLen = length(align) / nseq;
for(unsigned int z = 0 ; z < colLen; z++){
map<char, int> columnBases;
for(unsigned int s = 0; s < nseq; s++){
if(align[z + (s*colLen)] != gapValue<char>()){
columnBases[align[z + (s*colLen)]]++;
}
}
if(columnBases.size() == 1){
con << columnBases.begin()->first;
}
else{
*nn += 1;
con << "N";
}
}
#ifdef DEBUG
cerr << "seqAn alignment" << endl;
cerr << aliG << endl;
#endif
}
return con.str();
}
void gatherAlternativeAlignments(vector <int> & outBounds,
long int * lb ,
long int * ub ,
readPileUp & pileup ,
string & seqid ,
long int * pos ,
vector<string> & supports ){
string xaTag;
vector<string> XAhits;
for(list<BamAlignment>::iterator r = pileup.currentData.begin();
r != pileup.currentData.end(); r++){
if((*r).Position > *pos){
continue;
}
if(! (*r).IsPrimaryAlignment()){
continue;
}
if((*r).GetTag("XA", xaTag)){
vector<string> tmpHit = split(xaTag, ";");
for(vector<string>::iterator s = tmpHit.begin();
s != tmpHit.end(); ++s){
XAhits.push_back(*s);
}
}
}
for(vector<string>::iterator xa = XAhits.begin();
xa != XAhits.end(); ++xa){
// he uses a semi-colon on the last one :(
if((*xa).empty()){
continue;
}
vector<string> chimera = split((*xa), ",");
// we are looking for intra chrom
if(chimera[0].compare(seqid) != 0){
continue;
}
// remove the strand from XA
chimera[1].erase(0,1);
int chimPos = atoi( chimera[1].c_str() ) -1;
vector<cigar> c;
burnCigar(chimera[2], c);
if(c.front().type == 'S' && c.back().type == 'S'){
continue;
}
if(c.back().type == 'S'){
endPos(c, &chimPos);
chimPos = chimPos - 1;
}
supports.push_back("xa");
outBounds.push_back( chimPos );
#ifdef DEBUG
cerr << "gathering XA within CHR support cigar: " << *xa << endl;
#endif
}
}
void gatherSplitReadSupport(vector <int> & outBounds,
long int * lb ,
long int * ub ,
readPileUp & pileup ,
string & seqid ,
long int * pos ,
vector<string> & supports ){
vector<string> SAhits;
for(list<BamAlignment>::iterator r = pileup.currentData.begin();
r != pileup.currentData.end(); r++){
if((*r).Position > *pos){
continue;
}
if(! (*r).IsPrimaryAlignment()){
continue;
}
string saTag;
if((*r).GetTag("SA", saTag)){
vector<string> tmpHit = split(saTag, ";");
for(vector<string>::iterator s = tmpHit.begin();
s != tmpHit.end(); ++s){
SAhits.push_back(*s);
}
}
}
for(vector<string>::iterator sa = SAhits.begin();
sa != SAhits.end(); sa++){
// he uses a semi-colon on the last one :(
if((*sa).empty()){
continue;
}
vector<string> chimera = split((*sa), ",");
if(chimera[0].compare(seqid) != 0){
continue;
}
#ifdef DEBUG
cerr << "gathering within CHR support cigar: " << *sa << endl;
#endif
vector<cigar> c;
burnCigar(chimera[3], c);
int chimPos = atoi( chimera[1].c_str() ) -1;
if(c.front().type == 'S' && c.back().type == 'S'){
continue;
}
if(c.back().type == 'S'){
endPos(c, &chimPos);
chimPos = chimPos -1;
}
supports.push_back("sr");
outBounds.push_back( chimPos );
}
}
bool intraChromosomeSvEnd( vector <int> & inBounds ,
vector <int> & outOfBounds,
int supports [] ,
readPileUp & totalDat ,
string & seqid ,
string & chr2 ,
long int * pos ,
long int & setPos ,
long int & hi ,
long int & lo ,
FastaReference & RefSeq ,
string & altSeq ){
#ifdef DEBUG
cerr << "hunting for intra chromosomal end" << endl;
cerr << " n out: " << outOfBounds.size() << endl;
cerr << " n inb: " << inBounds.size() << endl;
#endif
vector<string> supportType;
for(unsigned int st = 0; st < outOfBounds.size(); st++){
supportType.push_back("mp");
}
unsigned int nz = 0;
for(vector<int>::iterator nzit = inBounds.begin();
nzit != inBounds.end(); ++nzit){
#ifdef DEBUG
cerr << "ILOB: " << *nzit << endl;
#endif
if(*nzit == 0){
nz += 1;
}
}
#ifdef DEBUG
cerr << "NZ: " << nz << endl;
#endif
#ifdef DEBUG
cerr << "About to gather target zone: " << endl;
for(vector<int>::iterator pi = outOfBounds.begin();
pi != outOfBounds.end(); pi++){
cerr << "OB: " << *pi << endl;
}
#endif
int targetZone = *pos ;
if(outOfBounds.size() > 1){
targetZone = mean(outOfBounds);
}
string endChunk = RefSeq.getSubSequence(chr2, targetZone-300, 600);
// Declares a default Aligner
StripedSmithWaterman::Aligner aligner;
// Declares a default filter
StripedSmithWaterman::Filter filter;
// Declares an alignment that stores the result
StripedSmithWaterman::Alignment alignment;
// Aligns the query to the ref
// aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment);
aligner.Align(altSeq.c_str(), endChunk.c_str(), endChunk.size(), filter, &alignment);
if(abs((targetZone - 500 + alignment.ref_begin)-targetZone) < 500){
#ifdef DEBUG
cerr << "targetZone before switch: " << targetZone << endl;
#endif
targetZone = (targetZone - 500 + alignment.ref_begin);
#ifdef DEBUG
cerr << "targetZone after switch: " << targetZone << endl;
#endif
}
long int lowerBoundOut = -1;
long int upperBoundOut = -1;
#ifdef DEBUG
cerr << "hunting for intra chromosomal end out of bounds" << endl;
#endif
lowerBoundOut = 0 ;
upperBoundOut = 0 ;
gatherSplitReadSupport(outOfBounds, &lowerBoundOut,
&upperBoundOut, totalDat, chr2, pos, supportType);
gatherAlternativeAlignments(outOfBounds,&lowerBoundOut,
&upperBoundOut, totalDat, chr2, pos, supportType);
if(outOfBounds.empty()){
return false;
}
#ifdef DEBUG
for(vector<int>::iterator test = outOfBounds.begin();
test != outOfBounds.end(); ++test){
cerr << "PP pos:" << *test << endl;
}
#endif
// loading all possible breakpoints into a map.
map<int, int> posCounts;
for(vector<int>::iterator pi = outOfBounds.begin();
pi != outOfBounds.end(); pi++){
if(posCounts.find(RoundNum(*pi)) == posCounts.end()){
posCounts[RoundNum(*pi)] = 1;
}
else{
posCounts[RoundNum(*pi)] += 1;
}
}
map<int, int> RawPosCounts;
for(vector<int>::iterator pi = outOfBounds.begin();
pi != outOfBounds.end(); pi++){
if(RawPosCounts.find(*pi) == RawPosCounts.end()){
RawPosCounts[*pi] = 1;
}
else{
RawPosCounts[*pi] += 1;
}
}
#ifdef DEBUG
cerr << "targetZone: " << targetZone << endl;
#endif
int bestPos = 0;
int maxCount = 0;
for(map<int, int>::iterator ci = posCounts.begin();
ci != posCounts.end(); ci++){
#ifdef DEBUG
cerr << "breakpoint: " << ci->first << " " << ci->second << " " << (int(targetZone) - ci->first) << endl;
#endif
if(ci->second > maxCount || (ci->second == maxCount
&& abs(targetZone - ci->first) < abs(bestPos - targetZone)) ){
bestPos = ci->first;
maxCount = ci->second;
}
}
if(maxCount < 2){
for(map<int, int>::iterator ci = posCounts.begin();
ci != posCounts.end(); ci++){
if(abs(ci->first - targetZone) < abs(bestPos - targetZone)){
bestPos = ci->first;
}
}
}
vector<double> trimmed;
#ifdef DEBUG
cerr << "bestPos: " << bestPos << endl;
#endif
if(bestPos != 0){
map<string, int> supportStrings;
supportStrings["mp"] = 0;
supportStrings["xa"] = 0;
supportStrings["sr"] = 0;
int supportVectorIndex = 0;
int tmpNewBestPos = 0;
for(vector<int>::iterator tr = outOfBounds.begin();
tr != outOfBounds.end(); ++tr){
if(abs(bestPos - tmpNewBestPos) > abs(bestPos - *tr) ){
tmpNewBestPos = *tr;
}
if(abs(bestPos - *tr) < 150){
trimmed.push_back(double(*tr));
supportStrings[supportType[supportVectorIndex]] += 1;
}
supportVectorIndex += 1;
}
bestPos = tmpNewBestPos;
supports[0] = supportStrings["mp"];
supports[1] = supportStrings["sr"];
supports[2] = supportStrings["xa"];
int bh = 0;
int nbp = bestPos;
for(map<int,int>::iterator rc = RawPosCounts.begin(); rc != RawPosCounts.end(); rc++ ){
#ifdef DEBUG
cerr << "hone pos:" << rc->first << " " << rc->second << endl;
#endif
if(abs(rc->first - bestPos) > 20){
continue;
}
if(rc->second > bh){
bh = rc->second;
nbp = rc->first;
}
}
bestPos = nbp;
double mu = mean(trimmed);
double sd = sqrt(var(trimmed, mu));
if(trimmed.size() > 1){
hi = bestPos + int(1.96*sd);
lo = bestPos - int(1.96*sd);
}
else{
hi = bestPos;
lo = bestPos;
}
setPos = int(bestPos) + 1;
}
if(nz > trimmed.size()){
setPos = int(*pos) +1;
hi = int(*pos) +1;
lo = int(*pos) +1;
}
return true;
}
void interChromosomeSvEnd(readPileUp & totalDat,
string & seqid,
string & chr2,
long int * pos,
vector<int> & positions,
vector<RefData> & seqnames ){
map<string, int> alternativeChrCounts;
alternativeChrCounts[seqid] = positions.size();
for(list<BamAlignment>::iterator it = totalDat.currentData.begin();
it != totalDat.currentData.end(); it++){
if(! (*it).IsPrimaryAlignment()
&& (((*it).AlignmentFlag & 0x0800) == 0)){
continue;
}
if(! (*it).IsMateMapped()){
continue;
}
if((*it).Position > *pos){
continue;
}
string chr2seqid = seqnames[(*it).MateRefID].RefName;
if(chr2seqid.compare(seqid) != 0){
if(alternativeChrCounts.find(chr2seqid) != alternativeChrCounts.end()){
alternativeChrCounts[chr2seqid] += 1;
}
else{
alternativeChrCounts[chr2seqid] = 1;
}
}
vector<string> SAhits;
string saTag;
if((*it).GetTag("SA", saTag)){
vector<string> tmpHit = split(saTag, ";");
for(vector<string>::iterator s = tmpHit.begin();
s != tmpHit.end(); ++s){
SAhits.push_back(*s);
}
}
for(vector<string>::iterator sah = SAhits.begin();
sah != SAhits.end(); sah++){
if((*sah).empty()){
continue;
}
vector<string> saDat = split(*sah, ",");
if(alternativeChrCounts.find(saDat[0]) != alternativeChrCounts.end()){
alternativeChrCounts[saDat[0]] += 1;
}
else{
alternativeChrCounts[saDat[0]] = 1;
}
}
// vector<string> XAhits;
// string xaTag;
//
// if((*it).GetTag("XA", xaTag)){
// vector<string> tmpHit = split(xaTag, ";");
// for(vector<string>::iterator s = tmpHit.begin();
// s != tmpHit.end(); ++s){
// XAhits.push_back(*s);
// }
// }
//
// for(vector<string>::iterator xah = XAhits.begin();
// xah != XAhits.end(); xah++){
//
// if((*xah).empty()){
// continue;
// }
// vector<string> xaDat = split(*xah, ",");
// if(alternativeChrCounts.find(xaDat[0]) != alternativeChrCounts.end()){
// alternativeChrCounts[xaDat[0]] += 1;
// }
// else{
// alternativeChrCounts[xaDat[0]] = 1;
// }
// }
//
}
string bestChr2;
int Chr2Count = 0;
for(map<string,int>::iterator ac = alternativeChrCounts.begin();
ac != alternativeChrCounts.end(); ac++){
#ifdef DEBUG
cerr << "otherseqids: " << ac->first << " " << ac->second << endl;
#endif
if(ac->second > Chr2Count){
bestChr2 = ac->first;
Chr2Count = ac->second;
}
}
if(alternativeChrCounts[seqid] == Chr2Count){
bestChr2 = seqid;
}
if(seqid.compare(bestChr2) != 0){
chr2 = bestChr2;
int seqidIndx = 0;
while(bestChr2.compare(seqnames[seqidIndx].RefName) != 0){
seqidIndx+=1;
}
positions.clear();
for(vector<BamAlignment>::iterator it = totalDat.primary[*pos].begin();
it != totalDat.primary[*pos].end(); it++){
if(! (*it).IsPrimaryAlignment()
&& (((*it).AlignmentFlag & 0x0800) == 0)){
continue;
}
if(! (*it).IsMateMapped()){
continue;
}
if((*it).MateRefID != seqidIndx ){
continue;
}
else{
positions.push_back((*it).MatePosition);
}
}
}
}
bool score(string seqid ,
long int * pos ,
readPileUp & totalDat ,
insertDat & localDists,
string & results ,
global_opts & localOpts ,
vector<uint64_t> & kmerDB ,
vector<RefData> & seqnames ,
int & offset ,
string & RefChunk ,
FastaReference & RefSeq ){
totalDat.processPileup(pos);
if(totalDat.primary[*pos].size() < localOpts.min &&
totalDat.supplement[*pos].size() < 2
){
#ifdef DEBUG
cerr << "left scoring because there were too few soft clips" << endl;
#endif
return true;
}
if(totalDat.nDiscordant == 0
&& totalDat.nsplitRead == 0
&& totalDat.evert == 0
&& totalDat.primary[*pos].size() < globalOpts.min){
#ifdef DEBUG
cerr << "left scoring because there was no discordant, no splits, no everts" << endl;
#endif
return true;
}
// if((double(totalDat.nLowMapQ) / double(totalDat.numberOfReads)) == 1){
//#ifdef DEBUG
// cerr << "left scoring because mapping quality was way too low" << endl;
//#endif
// return true;
// }
double cf = double(totalDat.tooManyCigs) / double(totalDat.numberOfReads);
stringstream attributes;
attributes << "AT="
<< double(totalDat.nPaired) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nDiscordant) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nMatesMissing) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nSameStrand) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nCrossChr) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nsplitRead) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nf1SameStrand) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nf2SameStrand) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.nf1f2SameStrand) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.internalInsertion) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.internalDeletion) / double(totalDat.numberOfReads);
// finding consensus sequence
vector<string> alts ; // pairBreaks;
string direction ;
uniqClips(pos, totalDat.primary, alts, direction, localOpts);
if(alts.size() < 2){
#ifdef DEBUG
cerr << "returned too few alts" << endl;
#endif
return true;
}
double nn = 0;
string altSeq = consensus(alts, &nn, direction);
if(altSeq.size() < 10){
#ifdef DEBUG
cerr << "alt size too short" << endl;
#endif
return true;
}
if(nn / double(altSeq.size()) > 0.50 ){
#ifdef DEBUG
cerr << "too many N" << endl;
#endif
return true;
}
map < string, indvDat*> ti;
for(unsigned int t = 0; t < localOpts.all.size(); t++){
indvDat * i;
i = new indvDat;
initIndv(i);
ti[localOpts.all[t]] = i;
}
vector<int> outOfbounds ;
vector<int> insertLength ;
loadReadsIntoIndvs(ti, totalDat,
localOpts, localDists,
pos, insertLength, outOfbounds);
#ifdef DEBUG
cerr << "loaded reads into indvs" << endl;
cerr << " inbound : " << outOfbounds.size() << endl;
cerr << " outbound: " << insertLength.size() << endl;
#endif
string esupport=".";
string bestEnd= ".";
string chr2 = seqid;
long int SVLEN = 0;
long int otherBreakPointPos = 0;
long int ehigh = 0;
long int elow = 0;
long int shigh = *pos;
long int slow = *pos;
// mp,sr,sa
int supports[3] = {0,0,0};
interChromosomeSvEnd(totalDat, seqid, chr2, pos, outOfbounds, seqnames);
if(intraChromosomeSvEnd(insertLength,
outOfbounds,
supports,
totalDat,
seqid,
chr2,
pos,
otherBreakPointPos,
ehigh,
elow,
RefSeq,
altSeq
)){
SVLEN = abs( (*pos) - otherBreakPointPos ) ;
if(seqid.compare(chr2) != 0){
SVLEN = 0;
}
}
// string localChunk = RefSeq.getSubSequence(seqid, (*pos-200),400);
#ifdef DEBUG
cerr << "substr: " << seqid << " " << *pos << " " << otherBreakPointPos << endl;
cerr << "offset: " << offset - 500 << " " << RefChunk.size() << endl;
#endif
string localChunk = RefChunk.substr(offset-200, 400);
string endChunk = RefSeq.getSubSequence(chr2, otherBreakPointPos-200, 400);
#ifdef DEBUG
cerr << "substrok: " << seqid << " " << *pos << " " << otherBreakPointPos << endl;
#endif
// Declares a default Aligner
StripedSmithWaterman::Aligner aligner;
// Declares a default filter
StripedSmithWaterman::Filter filter;
// Declares an alignment that stores the result
StripedSmithWaterman::Alignment alignmentEnd ;
StripedSmithWaterman::Alignment alignment;
// Aligns the query to the ref
// aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment);
aligner.Align(altSeq.c_str(), localChunk.c_str(), localChunk.size(), filter, &alignment);
aligner.Align(altSeq.c_str(), endChunk.c_str(), endChunk.size(), filter, &alignmentEnd);
#ifdef DEBUG
cerr << "local Pos SSW: " << (*pos - 200 + alignment.ref_begin) << " " << alignment.sw_score << endl;
cerr << "end Pos SSW : " << (otherBreakPointPos - 200 + alignmentEnd.ref_begin) << " " << alignmentEnd.sw_score << endl;
#endif
// cerr << "===== SSW result =====" << endl;
// cerr << "Best Smith-Waterman score:\t" << alignment.sw_score << endl
// << "Next-best Smith-Waterman score:\t" << alignment.sw_score_next_best << endl
// << "Reference start:\t" << alignment.ref_begin << endl
// << "Reference end:\t" << alignment.ref_end << endl
// << "Query start:\t" << alignment.query_begin << endl
// << "Query end:\t" << alignment.query_end << endl
// << "Next-best reference end:\t" << alignment.ref_end_next_best << endl
// << "Number of mismatches:\t" << alignment.mismatches << endl
// << "Cigar: " << alignment.cigar_string << endl
// << "current seqid: " << seqid << endl
// << "best seqid2 : " << chr2 << endl
// << "best Pos2 : " << otherBreakPointPos << endl
// << "con : " << altSeq << endl
// << "Pos1: " << *pos << " " << (*pos - 200 + alignment.ref_begin) << endl;
// cerr << "======================" << endl;
string reportRef = "N";
if(alignment.mismatches < 5 &&
totalDat.internalDeletion > 0 &&
totalDat.internalDeletion > totalDat.internalInsertion){
otherBreakPointPos = (*pos - 200 + alignment.ref_begin);
elow = otherBreakPointPos -10;
ehigh = otherBreakPointPos +10;
if(seqid.compare(chr2) == 0){
if(otherBreakPointPos >= *pos){
SVLEN = abs(*pos - otherBreakPointPos)-1;
reportRef = RefChunk.substr(offset, SVLEN+1);
altSeq = RefChunk.substr(offset, 1);
}
else{
otherBreakPointPos = 0;
}
}
}
if(alignment.mismatches < 5 &&
alignment.query_begin > 0 &&
totalDat.internalInsertion > 0 &&
totalDat.internalDeletion < totalDat.internalInsertion){
otherBreakPointPos = (*pos - 200 + alignment.ref_begin);
elow = otherBreakPointPos -10;
ehigh = otherBreakPointPos +10;
if(seqid.compare(chr2) == 0){
if(otherBreakPointPos >= *pos){
SVLEN = alignment.query_begin ;
reportRef = RefChunk.substr(offset-1, 2);
altSeq = reportRef + altSeq.substr(0, alignment.query_begin);
}
else{
otherBreakPointPos = 0;
}
}
}
// if(seqid.compare(chr2) == 0){
// if(*pos > otherBreakPointPos){
//
// #ifdef DEBUG
// cerr << "left scoring: start is upstream" << endl;
//#endif
//
// return true;
// }
// }
if(SVLEN > 1000000 && supports[0] == 0 && supports[1] == 0 ){
#ifdef DEBUG
cerr << "SV too large for zero support" << endl;
cerr << "mp: " << supports[0] << " sr: " << supports[1] << " xa: " << supports[2] << endl;
#endif
return true;
}
if(otherBreakPointPos == 0 ){
#ifdef DEBUG
cerr << "other breakpoint was not found" << endl;
#endif
return true;
}
vector<double> allPrimaryClippingPos;
for(std::map <long int, std::vector<BamTools::BamAlignment> >::iterator iz = totalDat.primary.begin();
iz != totalDat.primary.end(); ++iz){
if(abs(int(*pos) - int(iz->first)) > 50){
continue;
}
for(unsigned int i = 0; i < iz->second.size(); ++i){
allPrimaryClippingPos.push_back(double(iz->first));
}
}
if(allPrimaryClippingPos.size() > 1){
double mu = mean(allPrimaryClippingPos);
double sd = sqrt(var(allPrimaryClippingPos, mu));
shigh += int(1.96 * sd) + 1;
slow -= int(1.96 * sd) + 1;
}
if(supports[0] > 0 || supports[1] > 0 || supports[2] > 0){
stringstream sp;
sp << supports[0] << "," << supports[1] << "," << supports[2] ;
esupport = sp.str();
}
double nAlt = 0;
double nAltGeno = 0;
double alternative_relative_depth_sum = 0;
for(unsigned int t = 0; t < localOpts.all.size(); t++){
processGenotype(localOpts.all[t],
ti[localOpts.all[t]],
&nAlt,
&nAltGeno,
&alternative_relative_depth_sum ,
&localDists);
#ifdef DEBUG
cerr << "position: " << *pos << endl;
cerr << printIndvDat(ti[localOpts.all[t]]) << endl;
#endif
}
if(altSeq.size() < 5){
cleanUp(ti, localOpts);
return true;
#ifdef DEBUG
cerr << "left scoring because alt seq is less than 5" << endl;
#endif
}
if(nAlt == 0 ){
cleanUp(ti, localOpts);
#ifdef DEBUG
cerr << "left scoring because no alt genotypes" << endl;
#endif
return true;
}
attributes << ","
<< double(totalDat.mateTooClose) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.mateTooFar) / double(totalDat.numberOfReads)
<< ","
<< double(totalDat.evert) / double(totalDat.numberOfReads)
<< ","
<< (alternative_relative_depth_sum/nAltGeno) << ";";
info_field * info = new info_field;
initInfo(info);
loadInfoField(ti, info, localOpts);
string infoToPrint = infoText(info);
infoToPrint.append(attributes.str());
stringstream tmpOutput;
tmpOutput << seqid << "\t" ; // CHROM
tmpOutput << (*pos) +1 << "\t" ; // POS
tmpOutput << "." << "\t" ; // ID
tmpOutput << reportRef << "\t" ; // REF
tmpOutput << altSeq << "\t" ; // ALT
tmpOutput << "." << "\t" ; // QUAL
tmpOutput << "." << "\t" ; // FILTER
tmpOutput << infoToPrint ;
tmpOutput << "CF=" << cf << ";" ;
tmpOutput << "CISTART=" << slow << "," << shigh << ";" ;
tmpOutput << "CIEND=" << elow << "," << ehigh << ";" ;
tmpOutput << "PU=" << totalDat.primary[*pos].size() << ";" ;
tmpOutput << "SU=" << totalDat.supplement[*pos].size() << ";" ;
tmpOutput << "CU=" << totalDat.primary.size() + totalDat.supplement.size() << ";" ;
tmpOutput << "RD=" << totalDat.numberOfReads << ";" ;
tmpOutput << "NC=" << alts.size() << ";" ;
tmpOutput << "MQ=" << double(totalDat.mapQsum) / double(totalDat.numberOfReads) << ";";
tmpOutput << "MQF=" << double(totalDat.nLowMapQ) / double(totalDat.numberOfReads) << ";";
if(esupport.empty() || bestEnd.empty() || (unsigned int)bestEnd.c_str()[0] == 0){
bestEnd = "." ;
esupport = "." ;
}
tmpOutput << "SP=" << esupport << ";";
tmpOutput << "CHR2=" << chr2 << ";";
tmpOutput << "DI=" << direction << ";";
if(otherBreakPointPos == 0 ){
tmpOutput << "END=.;SVLEN=.\t";
}
else{
tmpOutput << "END=" << otherBreakPointPos << ";" << "SVLEN=" << SVLEN << "\t";
}
tmpOutput << "GT:GL:NR:NA:NS:RD" << "\t" ;
int enrichment = 0;
for(unsigned int t = 0; t < localOpts.all.size(); t++){
if(ti[localOpts.all[t]]->nBad > 1){
enrichment = 1;
}
tmpOutput << ti[localOpts.all[t]]->genotype
<< ":" << ti[localOpts.all[t]]->gls[0]
<< "," << ti[localOpts.all[t]]->gls[1]
<< "," << ti[localOpts.all[t]]->gls[2]
<< ":" << ti[localOpts.all[t]]->nGood
<< ":" << ti[localOpts.all[t]]->nBad
<< ":" << ti[localOpts.all[t]]->nClipping
<< ":" << ti[localOpts.all[t]]->nReads ;
if(t < localOpts.all.size() - 1){
tmpOutput << "\t";
}
}
tmpOutput << endl;
if(enrichment == 0 ){
cleanUp(ti, localOpts);
#ifdef DEBUG
cerr << "left scoring: no enrichment" << endl;
#endif
return true;
}
#ifdef DEBUG
cerr << "line: " << tmpOutput.str();
#endif
results.append(tmpOutput.str());
cleanUp(ti, localOpts);
delete info;
return true;
}
bool filter(BamAlignment & al){
if(!al.IsMapped()){
return false;
#ifdef DEBUG
// cerr << "failed not mapped " << al.Name << " " << endl;
#endif
}
if(al.IsDuplicate()){
return false;
#ifdef DEBUG
// cerr << "failed duplicate read " << al.Name << " " << endl;
#endif
}
if(! al.IsPrimaryAlignment()
&& ((al.AlignmentFlag & 0x0800) == 0)){
#ifdef DEBUG
// cerr << "failed not primary not split " << al.Name << " " << endl;
#endif
return false;
}
string saTag;
if(al.GetTag("SA", saTag)){
}
else{
if(al.MapQuality < 1){
#ifdef DEBUG
// cerr << "failed mapping quality too low" << al.Name << " " << endl;
#endif
return false;
}
}
string xaTag;
if(al.GetTag("XA", xaTag)){
vector<string> xas = split(xaTag, ";");
if(xas.size() > 5){
#ifdef DEBUG
// cerr << "failed xa filter" << al.Name << " " << xaTag << endl;
#endif
return false;
}
}
if(checkN(al.QueryBases)){
return false;
}
return true;
}
bool runRegion(int seqidIndex,
int start,
int end,
vector< RefData > seqNames,
vector<uint64_t> kmerDB){
string regionResults;
omp_set_lock(&lock);
if(seqNames[seqidIndex].RefLength < 2000){
cerr << "WARNING: " << seqNames[seqidIndex].RefName << " is too short to assay: " << seqNames[seqidIndex].RefLength << endl;
omp_unset_lock(&lock);
return false;
}
if((start - 500) < 0){
start = 500;
}
if((end + 500) > seqNames[seqidIndex].RefLength){
end = seqNames[seqidIndex].RefLength -500;
}
global_opts localOpts = globalOpts;
memcpy(localOpts.qualLookup, SangerLookup, 126*sizeof(int));
insertDat localDists = insertDists;
FastaReference RefSeq;
RefSeq.open(localOpts.fasta);
string refChunk = RefSeq.getSubSequence(seqNames[seqidIndex].RefName, start-500, (end-start)+500);
omp_unset_lock(&lock);
BamMultiReader All;
prepBams(All, "all");
if(!All.SetRegion(seqidIndex, start, seqidIndex, end)){
return false;
}
BamAlignment al ;
readPileUp allPileUp;
bool hasNextAlignment = true;
hasNextAlignment = All.GetNextAlignment(al);
if(!hasNextAlignment){
All.Close();
return false;
}
list <long int> clippedBuffer;
long int currentPos = -1;
while(hasNextAlignment || ! clippedBuffer.empty()){
while(currentPos >= clippedBuffer.front()){
if(clippedBuffer.empty()){
break;
}
clippedBuffer.pop_front();
}
while(clippedBuffer.empty()){
hasNextAlignment = All.GetNextAlignment(al);
#ifdef DEBUG
// cerr << "clipping buffer empty " << al.Name << " " << al.Position << endl;
#endif
if(!hasNextAlignment){
break;
}
if(!filter(al)){
continue;
}
vector< CigarOp > cd = al.CigarData;
if(cd.front().Type == 'S' && cd.front().Length > 9 && al.MapQuality > localOpts.MQ){
clippedBuffer.push_back(al.Position);
}
if(cd.back().Type == 'S' && cd.back().Length > 9 && al.MapQuality > localOpts.MQ){
clippedBuffer.push_back(al.GetEndPosition(false,true));
}
allPileUp.processAlignment(al);
}
clippedBuffer.sort();
#ifdef DEBUG
// cerr << "clipping buffer not empty; front: " << clippedBuffer.front() << "back: " << clippedBuffer.back() << endl;
#endif
while(al.Position <= clippedBuffer.front()){
hasNextAlignment = All.GetNextAlignment(al);
#ifdef DEBUG
// cerr << "clipping not buffer empty " << al.Name << " " << al.Position << endl;
#endif
if(!hasNextAlignment){
break;
}
if(!filter(al)){
continue;
}
vector< CigarOp > cd = al.CigarData;
if(cd.front().Type == 'S' && cd.front().Length > 9 && al.MapQuality > 5){
clippedBuffer.push_back(al.Position);
}
if(cd.back().Type == 'S' && cd.back().Length > 9 && al.MapQuality > 5){
clippedBuffer.push_back(al.GetEndPosition(false,true));
}
allPileUp.processAlignment(al);
}
clippedBuffer.sort();
#ifdef DEBUG
cerr << "About to score : " << currentPos << endl;
#endif
allPileUp.purgePast( ¤tPos );
int offset = currentPos - start + 500;
#ifdef DEBUG
cerr << "cp: " << currentPos << " st: " << start << " st+: " << start + 200 << " of: " << offset << endl;
#endif
if(! score(seqNames[seqidIndex].RefName,
¤tPos,
allPileUp,
localDists,
regionResults,
localOpts,
kmerDB,
seqNames,
offset,
refChunk,
RefSeq
)){
cerr << "FATAL: problem during scoring" << endl;
cerr << "FATAL: wham exiting" << endl;
exit(1);
}
currentPos = clippedBuffer.front();
if(regionResults.size() > 100000){
omp_set_lock(&lock);
cout << regionResults;
omp_unset_lock(&lock);
regionResults.clear();
}
}
omp_set_lock(&lock);
cout << regionResults;
omp_unset_lock(&lock);
regionResults.clear();
All.Close();
return true;
}
//bool loadKmerDB(vector<uint64_t> & DB){
//
// ifstream kmerDB (globalOpts.mask.c_str());
// string line;
//
// uint64_t kmer;
//
// if(kmerDB.is_open()){
// while(getline(kmerDB, line)){
// kmer = strtoull(line.c_str());
// DB.push_back(kmer);
// }
// }
// else{
// return false;
// }
//
// kmerDB.close();
// return true;
//}
bool loadBed(vector<regionDat*> & features, RefVector seqs){
map<string, int> seqidToInt;
int index = 0;
for(vector< RefData >::iterator sit = seqs.begin(); sit != seqs.end(); sit++){
seqidToInt[ (*sit).RefName ] = index;
index+=1;
}
ifstream featureFile (globalOpts.bed.c_str());
string line;
if(featureFile.is_open()){
while(getline(featureFile, line)){
vector<string> region = split(line, "\t");
int start = atoi(region[1].c_str()) ;
int end = atoi(region[2].c_str()) ;
regionDat * r = new regionDat;
r->seqidIndex = seqidToInt[region[0]];
r->start = start;
r->end = end ;
features.push_back(r);
}
}
else{
return false;
}
featureFile.close();
return true;
}
int main(int argc, char** argv) {
#ifdef DEBUG
cerr << "INFO: WHAM is in debug mode" << endl;
#endif
omp_init_lock(&lock);
srand((unsigned)time(NULL));
globalOpts.nthreads = -1;
parseOpts(argc, argv);
if(globalOpts.nthreads == -1){
}
else{
omp_set_num_threads(globalOpts.nthreads);
}
//loading up filenames into a vector
globalOpts.all.reserve(globalOpts.targetBams.size() + globalOpts.backgroundBams.size() );
globalOpts.all.insert( globalOpts.all.end(), globalOpts.targetBams.begin(), globalOpts.targetBams.end() );
globalOpts.all.insert( globalOpts.all.end(), globalOpts.backgroundBams.begin(), globalOpts.backgroundBams.end() );
// loading kmer database
vector<uint64_t> kmerDB;
// if(! (globalOpts.mask.compare("NA") == 0)){
// if(!loadKmerDB(kmerDB)){
// cerr << "FATAL: masking file was specified, but could not be opened or read." << endl;
// exit(1);
// }
// }
cerr << "INFO: gathering stats for each bam file." << endl;
#pragma omp parallel for schedule(dynamic, 3)
for(unsigned int i = 0; i < globalOpts.all.size(); i++){
grabInsertLengths(globalOpts.all[i]);
}
// the pooled reader
BamMultiReader allReader;
// grabbing sam header and checking for sotrted bams
prepBams(allReader, "all");
SamHeader SH = allReader.GetHeader();
if(!SH.HasSortOrder()){
cerr << "FATAL: sorted bams must have the @HD SO: tag in each SAM header." << endl;
exit(1);
}
allReader.Close();
prepBams(allReader, "all");
RefVector sequences = allReader.GetReferenceData();
allReader.Close();
printHeader();
int seqidIndex = 0;
vector< regionDat* > regions;
if(globalOpts.region.size() == 2){
for(vector< RefData >::iterator sit = sequences.begin(); sit != sequences.end(); sit++){
if((*sit).RefName == globalOpts.seqid){
break;
}
seqidIndex += 1;
}
}
if(globalOpts.region.size() == 2){
int start = globalOpts.region[0];
int end = globalOpts.region[1];
int p = start;
int e = 0;
for(; (p+1000000) <= end; p += 1000000){
regionDat * regionInfo = new regionDat;
regionInfo->seqidIndex = seqidIndex ;
regionInfo->start = p ;
regionInfo->end = 1000000 + p ;
regions.push_back(regionInfo);
e = p + 1000000;
}
if(e < end){
regionDat * regionInfo = new regionDat;
regionInfo->seqidIndex = seqidIndex ;
regionInfo->start = p ;
regionInfo->end = end ;
regions.push_back(regionInfo) ;
}
}
if(globalOpts.bed.compare("NA") == 0 && globalOpts.region.size() != 2){
for(vector< RefData >::iterator sit = sequences.begin(); sit != sequences.end(); sit++){
int start = 500;
if((*sit).RefLength < 2000){
cerr << "WARNING: " << (*sit).RefName << " is too short for WHAM-BAM: " << (*sit).RefLength << endl;
continue;
}
for(;start < ( (*sit).RefLength - 500) ; start += 1000000){
regionDat * chunk = new regionDat;
chunk->seqidIndex = seqidIndex;
chunk->start = start;
chunk->end = start + 1000000 ;
regions.push_back(chunk);
}
regionDat * lastChunk = new regionDat;
lastChunk->seqidIndex = seqidIndex;
lastChunk->start = start;
lastChunk->end = (*sit).RefLength;
seqidIndex += 1;
if(start < (*sit).RefLength){
regions.push_back(lastChunk);
}
}
}
else{
loadBed(regions, sequences);
}
#pragma omp parallel for schedule(dynamic, 3)
for(unsigned int re = 0; re < regions.size(); re++){
omp_set_lock(&lock);
cerr << "INFO: running region: " << sequences[regions[re]->seqidIndex].RefName << ":" << regions[re]->start << "-" << regions[re]->end << endl;
omp_unset_lock(&lock);
if(! runRegion( regions[re]->seqidIndex, regions[re]->start, regions[re]->end, sequences, kmerDB)){
omp_set_lock(&lock);
cerr << "WARNING: region failed to run properly: "
<< sequences[regions[re]->seqidIndex].RefName
<< ":" << regions[re]->start << "-"
<< regions[re]->end
<< endl;
omp_unset_lock(&lock);
}
}
cerr << "INFO: WHAM-BAM finished normally." << endl;
return 0;
}