Skip to content

Commit

Permalink
adding in quality scores to variants output
Browse files Browse the repository at this point in the history
  • Loading branch information
cmaceves committed Dec 14, 2023
1 parent af8c4b7 commit 164fd58
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 35 deletions.
4 changes: 3 additions & 1 deletion src/allele_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ uint32_t sum_allele_depths(std::vector<allele> test){
return(counter);
}

void position::update_alleles(std::string nt, uint32_t count){
void position::update_alleles(std::string nt, uint32_t count, uint32_t qual){
//update overall positions depth
depth += count;
//check if in allele vector
Expand All @@ -50,10 +50,12 @@ void position::update_alleles(std::string nt, uint32_t count){
//allele does not exist
if (exists == -1){
allele tmp;
tmp.mean_qual = qual;
tmp.depth = count;
tmp.nuc = nt;
alleles.push_back(tmp);
} else {
alleles[exists].mean_qual += qual;
alleles[exists].depth += count;
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/allele_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ struct allele {
std::string nuc;
uint32_t depth;
uint32_t reverse;
uint8_t mean_qual;
uint32_t mean_qual;
uint32_t beg;
uint32_t end;
float tmp_mean_qual;
Expand All @@ -28,12 +28,12 @@ class position {
uint32_t pos;
uint32_t depth=0;
std::vector<allele> alleles;
void update_alleles(std::string nt, uint32_t count);
void update_alleles(std::string nt, uint32_t count, uint32_t qual);
bool flux;
};

int check_position_exists(uint32_t p, std::vector<position> positions);
void update_alleles(std::string nt, uint32_t count);
void update_alleles(std::string nt, uint32_t count, uint32_t qual);
int check_allele_exists(std::string n, std::vector<allele> ad);
std::vector<allele> update_allele_depth(char ref, std::string bases,
std::string qualities,
Expand Down
1 change: 1 addition & 0 deletions src/interval_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ std::vector<allele> add_allele_vectors(std::vector<allele> new_alleles, std::vec
for(uint32_t j=0; j < return_alleles.size(); j++){
if (return_alleles[j].nuc == new_alleles[i].nuc){
return_alleles[j].depth += new_alleles[i].depth;
return_alleles[j].mean_qual += new_alleles[i].mean_qual;
found = true;
break;
}
Expand Down
59 changes: 36 additions & 23 deletions src/primer_bed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ std::vector<std::vector<uint32_t>> cigarotype::get_cigarotypes() { return cigaro

std::vector<std::vector<uint8_t>> cigarotype::get_aux_tags() { return aux_tags; }

std::vector<std::vector<uint32_t>> cigarotype::get_qualities() { return qualities; }

std::vector<std::vector<uint8_t>> cigarotype::get_sequences() { return sequences; }

std::vector<std::string> cigarotype::get_qnames() { return qnames; }
Expand Down Expand Up @@ -314,17 +316,18 @@ void primer::transform_mutations() {
std::vector<std::vector<uint8_t>> aux_tags = this->get_aux_tags();
std::vector<uint32_t> counts = this->get_count_cigarotypes();
std::vector<std::string> qnames = this->get_qnames();

std::vector<std::vector<uint32_t>> qualities = this->get_qualities();
//this tracks all mutations at all positions
std::string test = "";
//here let's turn the cigar string into a vector of alleles specific to this primer
//iterate all unique sequences
for(uint32_t i=0; i < cigarotypes.size(); i++){
//get the cigar string and start pos
std::vector<uint32_t> quality = qualities[i];
std::vector<uint32_t> cigarotype = cigarotypes[i]; //carries the insertions
std::vector<uint8_t> aux_tag = aux_tags[i]; //carries deletions
std::vector<uint8_t> sequence = sequences[i]; //carries NT values
std::vector<uint32_t> sc_positions; //sc positions
std::vector<uint32_t> sc_positions; //sc positions
uint32_t ccount = counts[i];
uint32_t start_pos = start_positions[i]; // pos after soft-clipped region
std::string qname = qnames[i];
Expand All @@ -338,23 +341,23 @@ void primer::transform_mutations() {
//honestly this whole bit could be better - more general
//insertions
if(op == 1){
std::ostringstream convert;
for(uint32_t k=0; k < oplen; k++) {
std::ostringstream convert;
//convert data type to get the characters
ignore_sequence.push_back(k+consumed_ref+start_pos);
convert << sequence[k+consumed_query];
}
std::string nuc = "+" + convert.str();
//check if this position exists
int exists = check_position_exists(start_pos+consumed_ref, positions);
if (exists != -1) {
positions[exists].update_alleles(nuc, ccount);
} else {
//add position to vector
position add_pos;
add_pos.pos = start_pos+consumed_ref; //don't add a position
add_pos.update_alleles(nuc, ccount);
positions.push_back(add_pos);
std::string nuc = "+" + convert.str();
//check if this position exists
int exists = check_position_exists(start_pos+consumed_ref, positions);
if (exists != -1) {
positions[exists].update_alleles(nuc, ccount, quality[k+consumed_query]);
} else {
//add position to vector
position add_pos;
add_pos.pos = start_pos+consumed_ref; //don't add a position
add_pos.update_alleles(nuc, ccount, quality[k+consumed_query]);
positions.push_back(add_pos);
}
}
consumed_query += oplen;
continue;
Expand Down Expand Up @@ -396,12 +399,12 @@ void primer::transform_mutations() {
} else if (isalpha(character) && deletion) {
int exists = check_position_exists(current_pos, positions);
if (exists != -1) {
positions[exists].update_alleles("-", ccount);
positions[exists].update_alleles("-", ccount, 0);
} else {
//add position to vector
position add_pos;
add_pos.pos = current_pos; //don't add a position
add_pos.update_alleles("-", ccount);
add_pos.update_alleles("-", ccount, 0);
positions.push_back(add_pos);
}
deletion_positions.push_back(current_pos);
Expand Down Expand Up @@ -446,24 +449,24 @@ void primer::transform_mutations() {
convert << sequence[j];
std::string nuc = convert.str();
if (exists != -1) {
positions[exists].update_alleles(nuc, ccount);
positions[exists].update_alleles(nuc, ccount, quality[j]);
} else {
position add_pos;
add_pos.pos = current_pos; //don't add a position
add_pos.update_alleles(nuc, ccount);
add_pos.update_alleles(nuc, ccount, quality[j]);
positions.push_back(add_pos);
}
}
}
}

//cigar must be passed by pointer due to array decay
void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname){
void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname, uint8_t *quality){
bool found = false; //have we seen this before
std::vector<uint8_t> saved_aux; //placeholder for saved sequence
std::vector<uint32_t> saved_cigarotype; //placeholder for saved cigarotypes
uint32_t sp; //placeholder for saved starting position

std::vector<uint32_t> qual_reformat;
std::vector<uint32_t> cigar_reformat;
std::vector<uint8_t> seq_reformat;
std::vector<uint8_t> aux_reformat;
Expand All @@ -473,7 +476,7 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n
std::vector<uint32_t> useful;
//first handle the array decay aspect
for(uint32_t i=0; i < nlength; i++){
cigar_reformat.push_back(cigar[i]);
cigar_reformat.push_back(cigar[i]);
uint32_t op = bam_cigar_op(cigar[i]);
//consumes query
if (bam_cigar_type(op) & 1){
Expand All @@ -489,14 +492,21 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n
aux_reformat.push_back(aux[i]);
i++;
} while(aux[i] != '\0');



for(uint32_t k=0; k < useful.size(); k++){
qual_reformat.push_back(quality[useful[k]]);
}
for(uint32_t i=0; i < cigarotypes.size(); i++){
sp = ncigarotypes[i];
saved_aux = aux_tags[i];
saved_cigarotype = cigarotypes[i];
if(cigar_reformat == saved_cigarotype && start_pos == sp && aux_reformat == saved_aux){
found = true;
count_cigarotypes[i] += 1;
for(uint32_t k = 0; k < qual_reformat.size(); k++){
qualities[i][k] += (uint32_t) qual_reformat[k];
}
break;
}
}
Expand All @@ -507,11 +517,14 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n
nt = seq_nt16_str[bam_seqi(seq, useful[k])];
seq_reformat.push_back(nt);
}

qualities.push_back(qual_reformat);
cigarotypes.push_back(cigar_reformat);
ncigarotypes.push_back(start_pos);
sequences.push_back(seq_reformat);
aux_tags.push_back(aux_reformat);
qnames.push_back(qname);

uint32_t digit = 1;
count_cigarotypes.push_back(digit);
nlengths.push_back(nlength);
Expand Down
7 changes: 5 additions & 2 deletions src/primer_bed.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@ class cigarotype{
std::vector<uint32_t> count_cigarotypes; //count of amount found
std::vector<std::vector<uint8_t>> aux_tags; //auxillary tags
std::vector<std::vector<uint8_t>> sequences; //sequences
std::vector<std::vector<uint32_t>> qualities; //qualities
std::vector<std::string> qnames; //TODO remove this at the end, qname
public:
std::vector<std::vector<uint32_t>> get_cigarotypes();
std::vector<std::vector<uint8_t>> get_aux_tags();
std::vector<std::vector<uint8_t>> get_sequences();
std::vector<std::vector<uint32_t>> get_qualities();
std::vector<uint32_t> get_count_cigarotypes();
std::vector<uint32_t> get_start_positions();
std::vector<uint32_t> get_nlengths();
std::vector<std::string> get_qnames();
void add_cigarotype(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname);
void add_cigarotype(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname, uint8_t *quality);
int test = 12;

};
Expand Down Expand Up @@ -81,9 +83,10 @@ void print_primer_info(primer primers);
void print_all_primer_info(std::vector<primer> primers);
primer get_min_start(std::vector<primer> primers);
primer get_max_end(std::vector<primer> primers);
void add_cigarotype(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname);
void add_cigarotype(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname, uint8_t *quality);
std::vector<std::vector<uint32_t>> get_cigarotypes();
std::vector<std::vector<uint8_t>> get_aux_tags();
std::vector<std::vector<uint32_t>> get_qualities();
std::vector<std::vector<uint8_t>> get_sequences();
std::vector<uint32_t> get_count_cigarotypes();
std::vector<uint32_t> get_nlengths();
Expand Down
14 changes: 8 additions & 6 deletions src/saga.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
continue;
}
overlapping_primers.clear();
//for this case, we've already trimmed so the starting pos will be shifted
//TODO look instead for primers matching a small range!
//TODO handle unpaired
if(strand == '+'){
for(uint32_t i=start_pos-10; i < start_pos+10; i++){
//std::cerr << "i " << i << " " << start_pos << std::endl;
Expand Down Expand Up @@ -147,7 +146,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
//get cigar for the read
uint32_t *cigar = bam_get_cigar(r);
uint32_t nlength = r->core.n_cigar;

uint8_t *qualities = bam_get_qual(r);
//assign to a primer not an amplicon, because here direction matters
//TODO handle the case of unpaired reads
if (overlapping_primers.size() == 0){
Expand All @@ -163,7 +162,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
uint32_t pend = primers[j].get_end();

if (start == pstart && end == pend){
primers[j].add_cigarotype(cigar, aln->core.pos, nlength, seq, aux, bam_get_qname(aln));
primers[j].add_cigarotype(cigar, aln->core.pos, nlength, seq, aux, bam_get_qname(aln), qualities);
}
}
}
Expand All @@ -173,6 +172,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
for(uint32_t i=0; i < primers.size(); i++){
primers[i].transform_mutations();
}
//exit(1);
std::cerr << "setting amplicon level haplotypes" << std::endl;
//AMPLICON METHOD translate this into amplicon haplotype obj of mutations per primer (ie. variant freq per amplicon)
for (uint32_t i=0; i < primers.size(); i++){
Expand Down Expand Up @@ -253,8 +253,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
file << variants[i].alleles[j].nuc << "\t";
file << std::to_string(variants[i].alleles[j].depth) << "\t";
file << std::to_string(freq) << "\t";
//TODO QUALITY
file << std::to_string(20) << "\t";
file << std::to_string((float)variants[i].alleles[j].mean_qual / (float)variants[i].alleles[j].depth) << "\t";
std::vector<uint32_t>::iterator it;
it = find(flagged_positions.begin(), flagged_positions.end(), variants[i].pos);
if (it != flagged_positions.end()){
Expand All @@ -267,6 +266,9 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,

}
file.close();
bam_destroy1(aln);
bam_hdr_destroy(header);
sam_close(in);
//end, data has been appropriately preprocessed and problematic positions have been flagged
//room for extension to calcualte physical linkage in the future
return(retval);
Expand Down

0 comments on commit 164fd58

Please sign in to comment.