Skip to content

Commit

Permalink
starting rewrite for paired reads
Browse files Browse the repository at this point in the history
  • Loading branch information
cmaceves committed Jan 10, 2025
1 parent cb4a9cc commit cd8b450
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 101 deletions.
44 changes: 28 additions & 16 deletions src/gmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ gaussian_mixture_model retrain_model(uint32_t n, arma::mat data, std::vector<var

gaussian_mixture_model gmodel;
gmodel.n = n;

// Matrix to store the centroids
arma::mat centroids;
arma::mat initial_means(1, n, arma::fill::zeros);
Expand All @@ -191,9 +192,9 @@ gaussian_mixture_model retrain_model(uint32_t n, arma::mat data, std::vector<var
std::vector<double> total_distances;
std::vector<std::vector<double>> all_centroids;

for(uint32_t j=0; j < 5; j++){
for(uint32_t j=0; j < 15; j++){
//std::cerr << "iteration j " << j << std::endl;
bool status2 = arma::kmeans(centroids, data, n, arma::random_subset, 15, false);
bool status2 = arma::kmeans(centroids, data, n, arma::random_subset, 10, false);
if(!status2) continue;

double total_dist = 0;
Expand All @@ -220,10 +221,10 @@ gaussian_mixture_model retrain_model(uint32_t n, arma::mat data, std::vector<var
std::cerr << range << std::endl;*/
std::vector<double> tmp;
for(auto c : centroids){
std::cerr << c << " ";
//std::cerr << c << " ";
tmp.push_back((double)c);
}
std::cerr << "\n";
//std::cerr << "\n";
all_centroids.push_back(tmp);
total_distances.push_back(total_dist);
}
Expand Down Expand Up @@ -975,7 +976,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
variants.push_back(base_variants[i]);
all_var.push_back(base_variants[i].freq);
count_pos.push_back(base_variants[i].position);
//std::cerr << base_variants[i].freq << " " << base_variants[i].position << " " << base_variants[i].nuc << std::endl;
std::cerr << base_variants[i].freq << " " << base_variants[i].position << " " << base_variants[i].nuc << std::endl;
}
}
std::cerr << "useful var " << useful_var << std::endl;
Expand Down Expand Up @@ -1050,16 +1051,25 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
}
std::cerr << "\n";
float percent_far = (float)count_far / (float)useful_var;
std::cerr << "mean " << mean << " mad " << mad << " cluster size " << data.size() << " count far " << count_far << " percent far " << percent_far << std::endl;

tmp_mads.push_back(mad);
tmp_percent_far.push_back(percent_far);

if(mad <= 0.10 && percent_far <= 0.10){
optimal = true;
float ratio = (float)useful_var / (float) counter;
std::cerr << "mean " << mean << " mad " << mad << " cluster size " << data.size() << " count far " << count_far << " percent far " << percent_far << " ratio " << ratio << std::endl;
if(ratio >= 5){
if(mad <= 0.10 && percent_far <= 0.10){
optimal = true;
} else {
optimal = false;
break;
}
} else {
optimal = false;
break;
if(mad <= 0.03 && percent_far <= 0.05){
optimal = true;
} else{
optimal = false;
break;
}
}
}

Expand All @@ -1077,11 +1087,13 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
percents.push_back(tmp_percent_far);
}
std::cerr << "optimal n " << optimal_n << std::endl;
retrained.means.clear();
retrained.hefts.clear();
retrained.prob_matrix.clear();
retrained = retrain_model(optimal_n, data, variants, lower_n, 0.001);
assign_clusters(variants, retrained, lower_n);
if(optimal_n != retrained.means.size()){
retrained.means.clear();
retrained.hefts.clear();
retrained.prob_matrix.clear();
retrained = retrain_model(optimal_n, data, variants, lower_n, 0.001);
assign_clusters(variants, retrained, lower_n);
}
std::vector<double> centroids;
std::vector<std::vector<double>> c_clusters(optimal_n);
for(auto var : variants){
Expand Down
66 changes: 52 additions & 14 deletions src/interval_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
using namespace std::chrono;
// Constructor for initializing an Interval Tree
IntervalTree::IntervalTree() { _root = NULL; }

std::vector<allele> add_allele_vectors(std::vector<allele> new_alleles, std::vector<allele> return_alleles){
/*
* @param return_alleles : the alleles we're saving to the amplicon
Expand Down Expand Up @@ -38,6 +37,32 @@ void IntervalTree::combine_haplotypes(ITNode *root){
combine_haplotypes(root->right);
}

void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint8_t> qualities){
if (root==NULL) return;
//if ((uint32_t)root->data->low > upper) return;
if(((uint32_t)root->data->low <= lower) && (upper <= (uint32_t)root->data->high)){
for(uint32_t i=0; i < positions.size(); i++){
for(uint32_t j=0; j < root->amp_positions.size(); j++){
if(positions[i] == root->amp_positions[j].pos){
root->amp_positions[j].update_alleles(bases[i], 1, qualities[i], false);
}
}
}
}
find_read_amplicon(root->right, lower, upper, positions, bases, qualities);
}

void IntervalTree::amplicon_position_pop(ITNode *root){
if (root==NULL) return;
for(uint32_t i=root->data->low; i < root->data->high; i++){
position add_pos;
add_pos.pos = i;
add_pos.alleles = populate_basic_alleles();
root->amp_positions.push_back(add_pos);
}
amplicon_position_pop(root->right);
}

void IntervalTree::detect_amplicon_overlaps(ITNode *root, uint32_t find_position){
if (root==NULL) return;
if (find_position < (uint32_t)root->data->low) return;
Expand Down Expand Up @@ -90,10 +115,11 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32
std::vector<uint8_t> sequence;
std::vector<uint8_t> quality;

uint32_t quality_threshold=20;

uint8_t nt = 0;
uint32_t length = 0;
uint32_t ll = 0;
uint32_t op;
uint32_t length = 0, ll=0, op=0;

//auto start = high_resolution_clock::now();
for(uint32_t i=0; i < nlength; i++){
op = bam_cigar_op(cigar[i]);
Expand All @@ -105,9 +131,19 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32
ll += length;
}
}
//TESTLINES
uint32_t qual_threshold = 20;
quality.reserve(useful.size());
sequence.reserve(useful.size());
for(uint32_t k=0; k < useful.size(); k++){
nt = seq_nt16_str[bam_seqi(seq, useful[k])];
sequence.push_back(nt);
if(qual[useful[k]] < qual_threshold){
sequence.push_back('L');
total_qual += qual[useful[k]];
total_bases++;
} else {
nt = seq_nt16_str[bam_seqi(seq, useful[k])];
sequence.push_back(nt);
}
quality.push_back(qual[useful[k]]);;
}
useful.clear();
Expand All @@ -127,18 +163,22 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32
}
nuc.clear();
nuc = convert.str();

int avg_q = (int)q/nuc.size();
char ch = 'L';
std::string nuc = "+" + convert.str();
//check if this position exists
int exists = check_position_exists(start_pos+consumed_ref, variants);
if (exists != -1) {
if (exists != -1 && nuc.find(ch) == std::string::npos) {
variants[exists].update_alleles(nuc, 1, avg_q, false);
} else {
//add position to vector
position add_pos;
add_pos.pos = start_pos+consumed_ref; //don't add a position
add_pos.update_alleles(nuc, 1, avg_q, false);
variants.push_back(add_pos);
if (nuc.find(ch) == std::string::npos) {
add_pos.update_alleles(nuc, 1, avg_q, false);
variants.push_back(add_pos);
}
}
consumed_query += oplen;
continue;
Expand Down Expand Up @@ -263,6 +303,9 @@ void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32
bool ref = false;
convert << sequence[j];
std::string nuc = convert.str();
if((uint32_t)quality[j] < quality_threshold){
continue;
}
if (std::find(substitutions.begin(), substitutions.end(), current_pos) == substitutions.end()){
ref = true;
}
Expand Down Expand Up @@ -305,11 +348,6 @@ void IntervalTree::set_haplotypes(ITNode *root, primer prim){
for (uint32_t i=0; i < tmp_pos.size(); i++){
position add_pos = tmp_pos[i];
bool found = false;
if((prim.get_start() == 27623 || prim.get_start() == 27735) && add_pos.pos == 27752){
for(auto xx : add_pos.alleles){
std::cerr << xx.nuc << " " << xx.depth << std::endl;
}
}
// check if we already have a pos for this pos
for(uint32_t j=0; j < root->amp_positions.size(); j++){
position here_pos = root->amp_positions[j];
Expand Down
12 changes: 9 additions & 3 deletions src/interval_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class IntervalTree {
void insert(ITNode *root, Interval data);
bool envelopSearch(ITNode *root, Interval data);
void inOrder(ITNode *root);
void amplicon_position_pop(ITNode *root);
void print_amplicons(ITNode *root);
void get_max_pos(ITNode *root);
void set_haplotypes(ITNode *root, primer prim);
Expand All @@ -57,12 +58,15 @@ class IntervalTree {
void detect_abberations(ITNode *root, uint32_t pos);
void detect_amplicon_overlaps(ITNode *root, uint32_t pos);
void detect_primer_issues(ITNode *root, uint32_t pos);
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint8_t> qualities);
public:
uint32_t max_pos=0;
std::vector<std::vector<uint32_t>> overlaps;
std::vector<position> test_flux; //storage for looking at pos across all amps
std::vector<uint32_t> test_test;
std::vector<position> variants; //all variants across every position
uint32_t total_bases=0;
uint32_t total_qual=0;
std::vector<uint32_t> flagged_positions; //positions where freq flux occurs MIGHT NOT NEED
IntervalTree(); // constructor
void insert(Interval data) { insert(_root, data); }
Expand All @@ -78,7 +82,8 @@ class IntervalTree {
void combine_haplotypes() {combine_haplotypes(_root);}
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
void populate_variants();

void find_read_amplicon(uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint8_t> qualities) {find_read_amplicon(_root, lower, upper, positions, bases, qualities);}
void amplicon_position_pop() {amplicon_position_pop(_root);}
};

void combine_haplotypes();
Expand All @@ -90,6 +95,7 @@ void populate_variants();
int unpaired_primers(ITNode *root, primer prim);
void detect_primer_issues(ITNode *root, uint32_t find_position);
void detect_amplicon_overlaps(ITNode *root, uint32_t find_position);
IntervalTree populate_amplicons(std::string pair_info_file,
std::vector<primer> &primers);
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint8_t> qualities);
IntervalTree populate_amplicons(std::string pair_info_file, std::vector<primer> &primers);
IntervalTree amplicon_position_pop();
#endif
46 changes: 25 additions & 21 deletions src/primer_bed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ int populate_pair_indices(std::vector<primer>& primers, std::string path) {
return 0;
}

void primer::transform_mutations(std::string ref_path) {
void primer::transform_mutations() {
/*
* Take all recorded cigarotypes and transform them into positions relative to primer
*/
Expand All @@ -357,9 +357,8 @@ void primer::transform_mutations(std::string ref_path) {
std::vector<std::vector<std::string>> qnames = this->get_qnames();
std::vector<std::vector<uint32_t>> qualities = this->get_qualities();
std::vector<bool> all_forward = this->get_direction();
//this tracks all mutations at all positions

std::string test = "";
//ref_antd refantd(ref_path, "");

//here let's turn the cigar string into a vector of alleles specific to this primer
//iterate all unique sequences
Expand All @@ -381,6 +380,7 @@ void primer::transform_mutations(std::string ref_path) {
uint32_t position_val = start_positions[i];
uint32_t relative_position = 0;
std::vector<std::vector<uint32_t>> soft_clipped, relative_soft_clipped;
char ch = 'L';

//we'll use the cigar string to handle insertions
for(uint32_t j=0; j < cigarotype.size(); j++){
Expand All @@ -404,7 +404,6 @@ void primer::transform_mutations(std::string ref_path) {
for(uint32_t j=0; j < cigarotype.size(); j++){
uint32_t op = bam_cigar_op(cigarotype[j]);
uint32_t oplen = bam_cigar_oplen(cigarotype[j]);
//std::cerr << op << " " << oplen << std::endl;
//honestly this whole bit could be better - more general
//insertions
if(op == 1){
Expand All @@ -422,9 +421,9 @@ void primer::transform_mutations(std::string ref_path) {
std::string nuc = "+" + convert.str();
//check if this position exists
int exists = check_position_exists(start_pos+consumed_ref, positions);
if (exists != -1) {
if (exists != -1 && (nuc.find(ch) == std::string::npos)) {
positions[exists].update_alleles(nuc, ccount, avg_q, false);
} else {
} else if (nuc.find(ch) == std::string::npos){
//add position to vector
position add_pos;
add_pos.pos = start_pos+consumed_ref; //don't add a position
Expand Down Expand Up @@ -566,10 +565,9 @@ void primer::transform_mutations(std::string ref_path) {
bool ref = false;
convert << sequence[j];
std::string nuc = convert.str();
if(nuc == "L") continue; //this nucleotide was below the threshold of quality
/*if(current_pos == 23664 && ccount > 5){
std::cerr << nuc << " " << ccount << " " << qname << std::endl;
}*/
if(nuc == "L") {
continue;
} //this nucleotide was below the threshold of quality
if (std::find(substitutions.begin(), substitutions.end(), current_pos) == substitutions.end()){
ref = true;
}
Expand All @@ -588,6 +586,7 @@ void primer::transform_mutations(std::string ref_path) {

//cigar must be passed by pointer due to array decay
void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, std::string qname, uint8_t *quality, bool is_rev) {

bool found = false; //have we seen this before
std::vector<uint8_t> saved_aux; //placeholder for saved sequence
std::vector<uint32_t> saved_cigarotype; //placeholder for saved cigarotypes
Expand All @@ -600,14 +599,13 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n

std::vector<uint8_t> aux_reformat;
std::string test = "";
uint32_t length=0;
uint32_t ll=0;
uint32_t length=0, ll=0;

std::vector<uint32_t> useful;
//first handle the array decay aspect
for(uint32_t i=0; i < nlength; i++){
cigar_reformat.push_back(cigar[i]);
uint32_t op = bam_cigar_op(cigar[i]);
//std::cerr << op << " " << bam_cigar_oplen(cigar[i]) << std::endl;
//consumes query
if (bam_cigar_type(op) & 1){
length = bam_cigar_oplen(cigar[i]);
Expand Down Expand Up @@ -636,14 +634,13 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n
uint32_t qual_threshold = 20;
char sub_nt = 'L';
for(uint32_t k=0; k < useful.size(); k++){
bool above_thresh=true;
qual_reformat.push_back((uint32_t)quality[useful[k]]);
if((uint32_t)quality[useful[k]] < qual_threshold){
seq_reformat.push_back(sub_nt);
above_thresh = false;
low_count++;
low_qual += (uint32_t)quality[useful[k]];
}
qual_reformat.push_back((uint32_t)quality[useful[k]]);
if(substitution && above_thresh){
uint8_t nt = 0;
if(substitution){
uint8_t nt = 0;
nt = seq_nt16_str[bam_seqi(seq, useful[k])]; //this operation is costly....
seq_reformat.push_back(nt);
}
Expand Down Expand Up @@ -676,8 +673,12 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n
if(seq_reformat.empty()){
seq_reformat.reserve(useful.size());
for(uint32_t k=0; k < useful.size(); k++){
uint8_t nt = seq_nt16_str[bam_seqi(seq, useful[k])];
seq_reformat.push_back(nt);
if(qual_reformat[k] >= qual_threshold){
uint8_t nt = seq_nt16_str[bam_seqi(seq, useful[k])];
seq_reformat.push_back(nt);
} else{
seq_reformat.push_back(sub_nt);
}
}
}
if(is_rev){
Expand All @@ -696,6 +697,9 @@ void cigarotype::add_cigarotype(uint32_t *cigar , uint32_t start_pos, uint32_t n
}
}




primer get_min_start(std::vector<primer> primers) {
std::vector<primer>::iterator it;
auto minmax_start = std::minmax_element(
Expand Down
Loading

0 comments on commit cd8b450

Please sign in to comment.