src/lib/dataStructures.hpp
#ifndef DATASTRUCTURES_H
#define DATASTRUCTURES_H
#include <string>
#include <vector>
#include <map>
#include <string>
#include "api/BamMultiReader.h"
#include "fastahack/Fasta.h"
#include "ssw_cpp.h"
struct regionDat{
int seqidIndex ;
int start ;
int end ;
};
struct readPair{
int flag;
int count;
BamTools::BamAlignment al1;
BamTools::BamAlignment al2;
};
struct cigDat{
int Length;
char Type;
};
struct saTag{
int seqid;
int pos;
bool strand;
int mapQ;
int NM;
std::vector<cigDat> cig;
};
struct node;
struct edge;
struct edge{
node * L;
node * R;
std::map<char,int> support;
};
struct node{
int seqid ;
int pos ;
std::vector <edge *> eds;
std::map<std::string, int> sm;
};
struct graph{
std::map< int, std::map<int, node *> > nodes;
std::vector<edge *> edges;
};
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : a node
Function does : visits all the edges and counts the support
Function returns: int (count of support)
*/
int getSupport(node * N){
int support = 0;
for(std::vector<edge *>::iterator ed = N->eds.begin() ;
ed != N->eds.end(); ed++){
support += (*ed)->support['L'];
support += (*ed)->support['H'];
support += (*ed)->support['S'];
support += (*ed)->support['I'];
support += (*ed)->support['D'];
support += (*ed)->support['V'];
support += (*ed)->support['M'];
support += (*ed)->support['R'];
support += (*ed)->support['X'];
support += (*ed)->support['A'];
support += (*ed)->support['Z'];
}
return support;
}
class breakpoint{
private:
std::map<char, std::string > typeMap;
std::vector<std::string> refs;
std::vector<std::string> seqNames;
std::map<std::string, int> sms;
char type;
std::string typeName;
int totalGraphWeight ;
bool paired ;
bool bnd ;
bool masked ;
bool clustered;
bool print ;
long int length;
int nClustered ;
double totalCount ;
double tooFarCount ;
double tooCloseCount ;
double internalDCount;
double splitCount ;
double evertCount ;
double ssCount ;
double delCount ;
double insCount ;
double dupCount ;
double invCount ;
double traCount ;
double clusterFrac ;
double avgDist ;
void _count(node * n, std::map<edge *, int> & lu){
for(std::vector<edge *>::iterator ed = n->eds.begin() ;
ed != n->eds.end(); ed++){
if(lu.find(*ed) != lu.end()){
continue;
}
lu[*ed] = 1;
if((*ed)->L->seqid == (*ed)->R->seqid){
ssCount += (*ed)->support['M'] ;
ssCount += (*ed)->support['A'] ;
ssCount += (*ed)->support['R'] ;
splitCount += (*ed)->support['S'] ;
splitCount += (*ed)->support['V'] ;
splitCount += (*ed)->support['Z'] ;
tooFarCount += (*ed)->support['H'] ;
tooFarCount += (*ed)->support['M'] ;
evertCount += (*ed)->support['X'] ;
internalDCount += (*ed)->support['D'] ;
delCount += (*ed)->support['H'] ;
delCount += (*ed)->support['D'] ;
delCount += (*ed)->support['S'] ;
insCount += (*ed)->support['I'] ;
insCount += (*ed)->support['R'] ;
insCount += (*ed)->support['L'] ;
invCount += (*ed)->support['M'] ;
invCount += (*ed)->support['A'] ;
invCount += (*ed)->support['R'] ;
invCount += (*ed)->support['V'] ;
dupCount += (*ed)->support['Z'] ;
dupCount += (*ed)->support['S'] ;
dupCount += (*ed)->support['X'] ;
}
else{
traCount += (*ed)->support['H'] ;
traCount += (*ed)->support['D'] ;
traCount += (*ed)->support['S'] ;
traCount += (*ed)->support['I'] ;
traCount += (*ed)->support['R'] ;
traCount += (*ed)->support['L'] ;
traCount += (*ed)->support['M'] ;
traCount += (*ed)->support['A'] ;
traCount += (*ed)->support['V'] ;
traCount += (*ed)->support['S'] ;
traCount += (*ed)->support['X'] ;
traCount += (*ed)->support['Z'] ;
}
totalCount += (*ed)->support['H'];
totalCount += (*ed)->support['D'];
totalCount += (*ed)->support['S'];
totalCount += (*ed)->support['I'];
totalCount += (*ed)->support['R'];
totalCount += (*ed)->support['L'];
totalCount += (*ed)->support['M'];
totalCount += (*ed)->support['A'];
totalCount += (*ed)->support['V'];
totalCount += (*ed)->support['V'];
totalCount += (*ed)->support['S'];
totalCount += (*ed)->support['X'];
totalCount += (*ed)->support['Z'];
}
}
void _processInternal(void){
if(nodeL->seqid != nodeR->seqid ){
if(nodeL->seqid > nodeR->seqid){
node * tmp;
tmp = nodeL;
nodeL = nodeR;
nodeR = tmp;
}
}
else{
if(nodeL->pos > nodeR->pos){
node * tmp;
tmp = nodeL;
nodeL = nodeR;
nodeR = tmp;
}
this->length = this->nodeR->pos - nodeL->pos;
}
}
public:
node * nodeL;
node * nodeR;
breakpoint(void): type('D')
, totalGraphWeight(0)
, paired(false)
, bnd(false)
, masked(false)
, clustered(false)
, print(true)
, length(0)
, nClustered(0)
, totalCount(0)
, tooFarCount(0)
, tooCloseCount(0)
, internalDCount(0)
, splitCount(0)
, evertCount(0)
, ssCount(0)
, delCount(0)
, insCount(0)
, dupCount(0)
, invCount(0)
, traCount(0)
, clusterFrac(0)
, avgDist(0)
, nodeL(NULL)
, nodeR(NULL){
typeMap['D'] = "DEL";
typeMap['U'] = "DUP";
typeMap['T'] = "BND";
typeMap['I'] = "INS";
typeMap['V'] = "INV";
}
int getNClustered(void){
return this->nClustered;
}
double getSMSupport(std::string & s){
if(sms.find(s) == sms.end()){
return 0;
}
else{
return sms[s];
}
}
double getDelCount(void){
return this->delCount;
}
double getDupCount(void){
return this->dupCount;
}
double getInvCount(void){
return this->invCount;
}
double getTraCount(void){
return this->traCount;
}
double getInsCount(void){
return this->insCount;
}
double getClustFrac(void){
return this->clusterFrac;
}
double getSameStrandCount(void){
return this->ssCount;
}
double getTooFarCount(void){
return this->tooFarCount;
}
double getTooCloseCount(void){
return this->tooCloseCount;
}
double getInternalDelCount(void){
return this->internalDCount;
}
double getEvertCount(void){
return evertCount;
}
double getSplitReadCount(void){
return splitCount;
}
char getType(void){
return this->type;
}
double getAvgDist(void){
return this->avgDist;
}
long int getLength(void){
return length;
}
void setGoodPair(bool t){
this->paired = t;
}
bool IsMasked(void){
return this->masked;
}
bool IsPrint(void){
return this->print;
}
void unsetMasked(void){
this->masked = false;
}
void setMasked(void){
this->masked = true;
}
void unSetPrint(void){
this->print = false;
}
void setBadPair(void){
this->paired = true;
}
void setTotalSupport(int t){
this->totalCount = t;
}
bool IsBadPair(void){
return this->paired;
}
int getTotalSupport(void){
return this->totalCount;
}
bool add(node * n){
if(nodeL != NULL && nodeR != NULL){
return false;
}
if( nodeL == NULL ){
nodeL = n;
return true;
}
else{
nodeR = n;
_processInternal();
return true;
}
}
bool countSupportType(void){
if(nodeL == NULL || nodeR == NULL){
return false;
}
// make sure we dont double count edges
std::map<edge *, int> lu;
_count(nodeL, lu);
_count(nodeR, lu);
return true;
}
bool calcType(void){
if(nodeL == NULL || nodeR == NULL){
return false;
}
int maxN = delCount;
if(dupCount > maxN){
type = 'U';
maxN = dupCount;
}
if(invCount > maxN){
type = 'V';
maxN = invCount;
}
if(insCount > maxN){
type = 'I';
maxN = insCount;
}
if(traCount > maxN){
type = 'T';
maxN = traCount;
}
typeName = typeMap[type];
return true;
}
void getRefBases(std::string & reffile, std::map<int, string> & lookup){
FastaReference rs;
rs.open(reffile);
refs.push_back(rs.getSubSequence(lookup[nodeL->seqid],
nodeL->pos, 1));
if(nodeL->seqid != nodeR->seqid){
refs.push_back(rs.getSubSequence(lookup[nodeR->seqid],
nodeR->pos, 1));
}
seqNames.push_back(lookup[nodeL->seqid]);
seqNames.push_back(lookup[nodeR->seqid]);
}
friend std::ostream& operator<<(std::ostream& out,
const breakpoint& foo);
bool dupClusterCheck(void){
return dupClusterCheck(nodeL, nodeR);
}
bool invClusterCheck(void){
return invClusterCheck(nodeL, nodeR);
}
bool delClusterCheck(void){
return delClusterCheck(nodeL, nodeR);
}
bool dupClusterCheck(node * left, node * right){
if(left == NULL || right == NULL){
return false;
}
if(type != 'U'){
return false;
}
if(left->seqid != right->seqid ){
return false;
}
if(left->pos > right->pos){
node * tmp;
tmp = left;
left = right;
right = tmp;
}
int countY = 0;
int countN = 0;
long int distSum = 0;
int distCount = 0;
for( std::vector<edge*>::iterator it = left->eds.begin();
it != left->eds.end(); it++){
// same strand too far or too close
if( (*it)->support['X'] == 0 ){
continue;
}
// don't want the same node or non leaf
node * tmp = (*it)->L;
if(tmp->eds.size() > 1 || tmp == left ){
tmp = (*it)->R;
}
if(tmp->eds.size() > 1){
continue;
}
long int distL = abs(left->pos - tmp->pos);
long int distR = abs(right->pos - tmp->pos);
if(distR < distL){
countY += (*it)->support['X'];
distSum += distR;
distCount += 1;
}
else{
countN += (*it)->support['X'];
}
}
for( std::vector<edge*>::iterator it = right->eds.begin();
it != right->eds.end(); it++){
// same strand too far or too close
if( (*it)->support['X'] == 0 ){
continue;
}
// don't want the same node or non leaf
node * tmp = (*it)->L;
if(tmp->eds.size() > 1 || tmp == nodeR ){
tmp = (*it)->R;
}
if(tmp->eds.size() > 1){
continue;
}
long int distL = abs(left->pos - tmp->pos);
long int distR = abs(right->pos - tmp->pos);
if(distR < distL){
countY += (*it)->support['X'];
distSum += distR;
distCount += 1;
}
else{
countN += (*it)->support['X'];
}
}
int sum = countY + countN;
if(sum > 0){
clusterFrac = double(countY) / double(sum);
return true;
}
if(distCount > 0){
avgDist = double(distSum) / double(distCount);
}
if(sum > 0 && distCount > 0){
return true;
}
return false;
}
bool invClusterCheck(node * left, node * right){
if(left == NULL || right == NULL){
return false;
}
if(type != 'V'){
return false;
}
if(left->seqid != right->seqid ){
return false;
}
if(left->pos > right->pos){
node * tmp;
tmp = left;
left = right;
right = tmp;
}
int countY = 0;
int countN = 0;
long int distSum = 0;
int distCount = 0;
for( std::vector<edge*>::iterator it = left->eds.begin();
it != left->eds.end(); it++){
// same strand too far or too close
if((*it)->support['R'] == 0 && (*it)->support['A']){
continue;
}
// don't want the same node or non leaf
node * tmp = (*it)->L;
if(tmp->eds.size() > 1 || tmp == left ){
tmp = (*it)->R;
}
if(tmp->eds.size() > 1){
continue;
}
if(tmp->pos < left->pos || tmp->pos > right->pos){
continue;
}
long int distL = abs(left->pos - tmp->pos);
long int distR = abs(right->pos - tmp->pos);
if(distR < distL){
countY += (*it)->support['R'];
countY += (*it)->support['A'];
countY += (*it)->support['M'];
distSum += distR;
distCount += 1;
}
else{
countN += (*it)->support['R'];
countN += (*it)->support['A'];
countN += (*it)->support['M'];
}
}
for(std::vector<edge*>::iterator it = right->eds.begin();
it != right->eds.end(); it++){
// same strand too far or too close
if((*it)->support['R'] == 0 && (*it)->support['A']){
continue;
}
// don't want the same node or non leaf
node * tmp = (*it)->L;
if(tmp->eds.size() > 1 || tmp == left ){
tmp = (*it)->R;
}
if(tmp->eds.size() > 1){
continue;
}
if(tmp->pos < left->pos || tmp->pos > right->pos){
continue;
}
// same strands need to be internal to the inversion
if(tmp->pos > left->pos
&& tmp->pos < right->pos){
long int distL = abs(left->pos - tmp->pos);
long int distR = abs(right->pos - tmp->pos);
if(distR > distL){
countY += (*it)->support['R'];
countY += (*it)->support['A'];
countY += (*it)->support['M'];
distSum += distR;
distCount += 1;
}
else{
countN += (*it)->support['R'];
countN += (*it)->support['A'];
countN += (*it)->support['M'];
}
}
}
int sum = countY + countN;
if(sum > 0){
clusterFrac = double(countY) / double(sum);
}
if(distCount > 0){
avgDist = double(distSum) / double(distCount);
}
if(sum > 0 && distCount > 0){
return true;
}
return false;
}
void loadSMSupport(void){
for(std::map<std::string, int>::iterator sm = nodeL->sm.begin();
sm!= nodeL->sm.end(); sm++){
if(sms.find(sm->first) == sms.end() ){
sms[sm->first] = sm->second;
}
else{
sms[sm->first] += sm->second;
}
}
for(std::map<std::string, int>::iterator sm = nodeR->sm.begin();
sm!=nodeR->sm.end(); sm++){
if(sms.find(sm->first) == sms.end() ){
sms[sm->first] = sm->second;
}
else{
sms[sm->first] += sm->second;
}
}
}
bool delClusterCheck(node * left, node * right){
if(left == NULL || right == NULL){
return false;
}
if(type != 'D'){
return false;
}
if(left->seqid != right->seqid ){
return false;
}
if(left->pos > right->pos){
node * tmp;
tmp = left;
left = right;
right = tmp;
}
int countY = 0;
int countN = 0;
long int distSum = 0;
int distCount = 0;
/* loop over 5' node (always sorted)
- check if left pos = 5' node, if yes switch pos
- count those nodes to the right of 3' node
*/
for( std::vector<edge*>::iterator it = left->eds.begin();
it != left->eds.end(); it++){
if((*it)->support['H'] == 0){
continue;
}
long int five = (*it)->L->pos;
if((*it)->L == left){
five = (*it)->R->pos;
}
if(five > right->pos ){
countY += (*it)->support['H'];
distSum += abs(five - right->pos);
distCount += 1;
}
}
/* loop over 3' node (always sorted)
- check if right pos = 3' node, if yes switch pos
- count those nodes to the left of 5' node
*/
for( std::vector<edge*>::iterator it = right->eds.begin();
it != right->eds.end(); it++){
if((*it)->support['H'] == 0){
continue;
}
long int three = (*it)->L->pos;
if((*it)->L == right){
three = (*it)->R->pos;
}
if( three < left->pos ){
countY += (*it)->support['H'];
distSum += abs(three - left->pos);
distCount += 1;
}
}
int sum = countY + countN;
if(sum > 0){
clusterFrac = double(countY) / double(sum);
}
if(distCount > 0){
avgDist = double(distSum) / double(distCount);
}
if(sum > 0 && distCount > 0){
nClustered = countY;
return true;
}
return false;
}
};
std::ostream& operator<<(std::ostream& out, const breakpoint & foo){
long int start = foo.nodeL->pos ;
long int end = foo.nodeR->pos ;
long int len = abs(foo.nodeR->pos - foo.nodeL->pos);
if(foo.type == 'I'){
end = start;
}
if(foo.type == 'D'){
len = -1*len;
}
double sum = foo.delCount + foo.dupCount
+ foo.traCount + foo.invCount + foo.insCount;
stringstream ss;
int index = 0;
for(std::map<std::string, int>::const_iterator sm = foo.sms.begin();
sm != foo.sms.end(); sm++){
if(index == 0){
ss << sm->first;
}
else{
ss << "," << sm->first;
}
index += 1;
}
if(foo.type != 'T' && (foo.nodeL->seqid == foo.nodeR->seqid)){
out << foo.seqNames.front() << "\t"
<< start + 1 << "\t"
<< "." << "\t"
<< foo.refs.front() << "\t"
<< "<" << foo.typeName << ">\t"
<< "." << "\t"
<< "PASS" << "\t"
<< "A=" << foo.totalCount
<< ";CIEND=-10,10;CIPOS=-10,10"
<< ";CF=" << foo.clusterFrac
<< ";CW=" << (foo.delCount / sum) << ","
<< (foo.dupCount / sum) << ","
<< (foo.invCount / sum) << ","
<< (foo.insCount / sum) << ","
<< (foo.traCount / sum)
<< ";D=" << foo.delCount
<< ";DI=" << foo.avgDist
<< ";END=" << end + 1
<< ";EV=" << foo.evertCount
<< ";I=" << foo.insCount
<< ";SR=" << foo.splitCount
<< ";SS=" << foo.ssCount
<< ";SVLEN=" << len
<< ";SVTYPE=" << foo.typeName
<< ";T=" << foo.traCount
<< ";TAGS=" << ss.str()
<< ";TF=" << foo.tooFarCount
<< ";U=" << foo.dupCount
<< ";V=" << foo.invCount;
}
return out;
}
struct libraryStats{
std::map<std::string, double> mus ; // mean of insert length for each indvdual across 1e6 reads
std::map<std::string, double> sds ; // standard deviation
std::map<std::string, double> low ; // 25% of data
std::map<std::string, double> upr ; // 75% of the data
std::map<std::string, double> swm ;
std::map<std::string, double> sws ;
std::map<std::string, double> avgD;
std::map<std::string, std::map<int, long int> > mqD;
double overallDepth;
} ;
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : left and right node
Function does : return true if the left is less than the right
Function returns: bool
*/
bool sortNodesBySupport(node * L, node * R){
return (getSupport(L) > getSupport(R)) ;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : a left and right node
Function does : provides the rule for the sort
Function returns: bool
*/
bool sortNodesByPos(node * L, node * R){
return (L->pos < R->pos);
}
//------------------------------- SUBROUTINE --------------------------------
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : edge pointer
Function does : init
Function returns: void
*/
void initEdge(edge * e){
e->L = NULL;
e->R = NULL;
// mate too close
e->support['L'] = 0;
// mate too far
e->support['H'] = 0;
// split read
e->support['S'] = 0;
// split read different strand (split)
e->support['V'] = 0;
// insertion
e->support['I'] = 0;
// deletion
e->support['D'] = 0;
// same strand too far
e->support['M'] = 0;
// same strand too close
e->support['R'] = 0;
// everted read pairs
e->support['X'] = 0;
e->support['K'] = 0;
// same strand
e->support['A'] = 0;
// split reads on wrong side of soft clip
e->support['z'] = 0;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : refID (int), left position (int), graph
Function does : determine if a node is in the graph
Function returns: bool
*/
inline bool isInGraph(int refID, int pos, graph & lc){
if(lc.nodes.find(refID) == lc.nodes.end()){
return false;
}
if(lc.nodes[refID].find(pos) != lc.nodes[refID].end()){
return true;
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : two node pointer
Function does : finds if nodes are directly connected
Function returns: bool
*/
bool connectedNode(node * left, node * right){
for(std::vector<edge *>::iterator l = left->eds.begin();
l != left->eds.end(); l++){
if(((*l)->L->pos == right->pos
|| (*l)->R->pos == right->pos)
&& (*l)->R->seqid == right->seqid ){
return true;
}
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : a vector of edge pointers
Function does : does a tree terversal and joins nodes
Function returns: NA
*/
bool findEdge(std::vector<edge *> & eds, edge ** e, int pos){
for(std::vector<edge *>::iterator it = eds.begin(); it != eds.end(); it++){
if((*it)->L->pos == pos || (*it)->R->pos == pos ){
(*e) = (*it);
return true;
}
}
return false;
}
//------------------------------- SUBROUTINE --------------------------------
/*
Function input : a vector of node pointers, and a postion
Function does : removes edges that contain a position
Function returns: NA
*/
void removeEdges(std::vector<node *> & tree, int pos){
for(std::vector<node *>::iterator rm = tree.begin();
rm != tree.end(); rm++){
std::vector<edge *> tmp;
for(std::vector<edge *>::iterator e = (*rm)->eds.begin();
e != (*rm)->eds.end(); e++){
if( (*e)->L->pos != pos && (*e)->R->pos != pos ){
tmp.push_back((*e));
}
else{
}
}
(*rm)->eds.clear();
(*rm)->eds.insert((*rm)->eds.end(), tmp.begin(), tmp.end());
}
}
#endif