diff --git a/src/interval_tree.cpp b/src/interval_tree.cpp index 8ee87fc8..316e9519 100755 --- a/src/interval_tree.cpp +++ b/src/interval_tree.cpp @@ -1,5 +1,6 @@ #include "interval_tree.h" - +#include +using namespace std::chrono; // Constructor for initializing an Interval Tree IntervalTree::IntervalTree() { _root = NULL; } @@ -32,13 +33,9 @@ void IntervalTree::combine_haplotypes(ITNode *root){ for(uint32_t i=0; i < root->amp_positions.size(); i++){ int exists = check_position_exists(root->amp_positions[i].pos, variants); //does exist - if (exists != -1){ - variants[exists].depth += root->amp_positions[i].depth; - std::vector new_alleles = add_allele_vectors(root->amp_positions[i].alleles, variants[exists].alleles); - variants[exists].alleles = new_alleles; - } else { - variants.push_back(root->amp_positions[i]); - } + variants[root->amp_positions[i].pos].depth += root->amp_positions[i].depth; + std::vector new_alleles = add_allele_vectors(root->amp_positions[i].alleles, variants[exists].alleles); + variants[root->amp_positions[i].pos].alleles = new_alleles; } combine_haplotypes(root->right); } @@ -58,7 +55,7 @@ void IntervalTree::detect_abberations(ITNode *root, uint32_t find_position){ } -void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, uint8_t* quality) { +void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, uint8_t* qual) { uint32_t consumed_query = 0; uint32_t consumed_ref = 0; std::vector sc_positions; //sc positions @@ -66,15 +63,38 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32 //first we handle insertion from the cigar std::vector useful; std::vector sequence; + std::vector quality; + + uint8_t nt = 0; + uint32_t length = 0; + uint32_t ll = 0; + uint32_t op; + for(uint32_t i=0; i < nlength; i++){ + op = bam_cigar_op(cigar[i]); + if (bam_cigar_type(op) & 1){ + length = bam_cigar_oplen(cigar[i]); + for(uint32_t k=0; k < length; k++){ + useful.push_back(ll+k); + } + ll += length; + } + } + for(uint32_t k=0; k < useful.size(); k++){ + nt = seq_nt16_str[bam_seqi(seq, useful[k])]; + sequence.push_back(nt); + quality.push_back(qual[useful[k]]);; + } + useful.clear(); + for(uint32_t j=0; j < nlength; j++){ - uint32_t op = bam_cigar_op(cigar[j]); + op = bam_cigar_op(cigar[j]); uint32_t oplen = bam_cigar_oplen(cigar[j]); if(op == 1){ for(uint32_t k=0; k < oplen; k++){ std::ostringstream convert; //convert data type to get the characters ignore_sequence.push_back(k+consumed_ref+start_pos); - convert << sequence[k+consumed_query]; + convert << sequence[k+consumed_query]; //this is throwing error std::string nuc = "+" + convert.str(); //check if this position exists int exists = check_position_exists(start_pos+consumed_ref, variants); @@ -102,9 +122,6 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32 } //consumes query if (bam_cigar_type(op) & 1){ - for(uint32_t k=0; k < oplen; k++){ - useful.push_back(consumed_query+k); - } consumed_query += oplen; } //consumes ref @@ -112,12 +129,6 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32 consumed_ref += oplen; } } - uint8_t nt = 0; - for(uint32_t k=0; k < useful.size(); k++){ - nt = seq_nt16_str[bam_seqi(seq, useful[k])]; - sequence.push_back(nt); - } - //we will use the aux tag to handle deletions bool deletion = false; uint32_t current_pos = start_pos; @@ -127,7 +138,7 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32 uint32_t last_char = 0; uint32_t j = 0; do { - char character = (char) aux[j]; + char character = (char)aux[j]; if (character == '^'){ current_pos += std::stoi(gather_digits); gather_digits = ""; @@ -168,6 +179,7 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32 //now that we know where the insertions and deletions are, let's just iterate the query sequence and add it in, skipping problem positions current_pos = start_pos; //j is relative to the sequence and current pos to the reference + //auto stop_again = high_resolution_clock::now(); for(uint32_t j=0; j < sequence.size(); j++){ std::vector::iterator it = find(deletion_positions.begin(), deletion_positions.end(), current_pos); if (it != deletion_positions.end()) { @@ -184,21 +196,23 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32 continue; } current_pos += 1; - int exists = check_position_exists(current_pos, variants); std::ostringstream convert; convert << sequence[j]; std::string nuc = convert.str(); - if (exists != -1) { - variants[exists].update_alleles(nuc, 1, quality[j]); - } else { - position add_pos; - add_pos.pos = current_pos; //don't add a position - add_pos.update_alleles(nuc, 1, quality[j]); - variants.push_back(add_pos); - } + variants[current_pos].update_alleles(nuc, 1, quality[j]); } + //auto stop_again_again = high_resolution_clock::now(); + //auto duration_again_again = duration_cast(stop_again_again - stop_again); + //std::cerr << duration_again_again.count() << std::endl; } +void IntervalTree::populate_variants(){ + for(uint32_t i=0; i <= max_pos; i++){ + position tmp; + tmp.pos = i; + variants.push_back(tmp); + } +} void IntervalTree::set_haplotypes(ITNode *root, primer prim){ if (root==NULL) return; char strand = prim.get_strand(); diff --git a/src/interval_tree.h b/src/interval_tree.h index 7ec25b93..0909d95e 100755 --- a/src/interval_tree.h +++ b/src/interval_tree.h @@ -69,6 +69,7 @@ class IntervalTree { void detect_abberations(uint32_t pos) {detect_abberations(_root, pos);} void combine_haplotypes() {combine_haplotypes(_root);} void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality); + void populate_variants(); }; void combine_haplotypes(); @@ -76,6 +77,7 @@ void detect_abberations(ITNode *root, uint32_t find_position); void get_max_pos(); void set_haplotypes(ITNode *root, primer prim); void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality); +void populate_variants(); IntervalTree populate_amplicons(std::string pair_info_file, std::vector &primers); #endif diff --git a/src/saga.cpp b/src/saga.cpp index d729e325..c7c87a05 100644 --- a/src/saga.cpp +++ b/src/saga.cpp @@ -47,6 +47,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, std::cerr << "Amplicons detected: " << std::endl; amplicons.inOrder(); amplicons.get_max_pos(); + amplicons.populate_variants(); std::cerr << "Maximum position " << amplicons.max_pos << std::endl; } else{ std::cerr << "Exiting." << std::endl; @@ -112,8 +113,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, // continue; //} overlapping_primers.clear(); - //TODO handle unpaired - //esp important for possible next era sequencing if(strand == '+'){ for(uint32_t i=start_pos-10; i < start_pos+10; i++){ if (i < 0 || i > amplicons.max_pos) continue; @@ -145,7 +144,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, //TODO handle the case of unpaired reads if (overlapping_primers.size() == 0){ amplicons.add_read_variants(cigar, aln->core.pos, nlength, seq, aux, qualities); - exit(1); outside_amp += 1; continue; } @@ -155,8 +153,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, for(uint32_t j=0; j < primers.size(); j++){ uint32_t pstart = primers[j].get_start(); - uint32_t pend = primers[j].get_end(); - + uint32_t pend = primers[j].get_end(); if (start == pstart && end == pend){ primers[j].add_cigarotype(cigar, aln->core.pos, nlength, seq, aux, bam_get_qname(aln), qualities); } @@ -245,7 +242,5 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); - //end, data has been appropriately preprocessed and problematic positions have been flagged - //room for extension to calcualte physical linkage in the future return(retval); }