diff --git a/src/gmm.cpp b/src/gmm.cpp index 2ca51215..62af3741 100644 --- a/src/gmm.cpp +++ b/src/gmm.cpp @@ -2,7 +2,119 @@ #include #include +void perm_generator(int n, int k, std::vector> &possible_permutations){ + std::vector d(n); + std::iota(d.begin(),d.end(),0); + do { + std::vector tmp; + for (int i = 0; i < k; i++){ + tmp.push_back(d[i]); + } + possible_permutations.push_back(tmp); + std::reverse(d.begin()+k,d.end()); + } while(std::next_permutation(d.begin(),d.end())); +} + +std::vector compare_cluster_assignment(std::vector> prob_matrix, std::vector assigned){ + double threshold = 3; + std::vector flagged_idx; + for(uint32_t i=0; i < prob_matrix.size(); i++){ + double assigned_prob = prob_matrix[i][assigned[i]]; + std::vector tmp = prob_matrix[i]; + std::sort(tmp.begin(), tmp.end(), std::greater()); + for(uint32_t j=0; j < tmp.size(); j++){ + if(tmp[j] >= assigned_prob) continue; + if(exp(tmp[j]) * threshold > exp(assigned_prob)){ + flagged_idx.push_back(i); + } + break; + } + } + return(flagged_idx); +} + +std::vector calculate_joint_probabilities(std::vector> prob_matrix, std::vector> permutations){ + /* + * Calcualte the best assignments maximizing the probability. Prob matrix is (n_clusters, n_variants) + */ + std::vector scores; //score for every permuation of assignment + //on the permutation level + for(uint32_t i=0; i < permutations.size(); i++){ + double score = 0; + for(uint32_t j = 0; j < permutations[i].size(); j++){ + //num of variants in position must match permutation + if(permutations[i].size() != prob_matrix.size()){ + score = -1000; + continue; + } + score += prob_matrix[j][permutations[i][j]]; + } + scores.push_back(score); + } + + uint32_t max_idx = std::distance(scores.begin(), std::max_element(scores.begin(), scores.end())); + return(permutations[max_idx]); +} + +std::vector> assign_variants_simple(std::vector filtered_frequencies, std::vector filtered_positions, std::vector> prob_matrix){ + /* + * Here we assign variants to the highest probability group on a per position basis making sure two variants at the same poistion don't end up in the same group. + */ + uint32_t n = prob_matrix.size(); + std::vector> clusters; + //populate empty cluster vectors + for(uint32_t i=0; i < n; i++){ + std::vector tmp; + clusters.push_back(tmp); + } + //find the unique positions + std::vector unique_pos; + for(uint32_t i = 0; i < filtered_positions.size(); i++){ + if (std::find(unique_pos.begin(), unique_pos.end(), filtered_positions[i]) == unique_pos.end()) { + unique_pos.push_back(filtered_positions[i]); + } + } + //determine all possible permutations of assignments, disregard sum condition of E(u) ~= 1 + std::vector idx_combinations(n); + std::iota(idx_combinations.begin(), idx_combinations.end(), 0); + std::vector> possible_permutations; + + perm_generator(n, 1, possible_permutations); + perm_generator(n, 2, possible_permutations); + perm_generator(n, 3, possible_permutations); + perm_generator(n, 4, possible_permutations); + + //now we loop every unique position and assign the max prob combo of variants + for(uint32_t i=0; i < unique_pos.size(); i++){ + std::vector pos_idxs; + std::vector> tmp_prob; + //all locations in the prob matrix for this position + for(uint32_t j = 0; j < filtered_positions.size(); j++){ + if(filtered_positions[j] == unique_pos[i]){ + pos_idxs.push_back(j); + std::vector tmp; + for(uint32_t k=0; k < n; k++){ + tmp.push_back(prob_matrix[k][j]); + } + tmp_prob.push_back(tmp); + } + } + //assign variants based on most probable position-wise + std::vector assigned = calculate_joint_probabilities(tmp_prob, possible_permutations); + //make sure the assignment is concrete + std::vector assignment_flagged = compare_cluster_assignment(tmp_prob, assigned); + for(uint32_t j=0; j < pos_idxs.size(); j++){ + if(std::find(assignment_flagged.begin(), assignment_flagged.end(), j) != assignment_flagged.end()) { + continue; + } + uint32_t idx = assigned[j]; + clusters[idx].push_back(filtered_frequencies[pos_idxs[j]]); + } + } + return(clusters); +} void go(uint32_t offset, uint32_t k, std::vector means, std::vector combination, std::vector> &combos, double error) { + //generates all the combinations if (k == 0) { double sum_of_elems = std::accumulate(combination.begin(), combination.end(), 0.0); std::cerr << sum_of_elems << std::endl; @@ -24,7 +136,7 @@ void solve_solution_sets(std::vector means, uint32_t n){ //generate the combinations std::vector combination; //placeholder to use in function std::vector> combos; - double errors = 0.05 + double error = 0.05; go(0, n, means, combination, combos, error); for(uint32_t i=0; i < combos.size(); i++){ for(uint32_t j=0; j < combos[i].size(); j++){ @@ -35,24 +147,34 @@ void solve_solution_sets(std::vector means, uint32_t n){ } -void determine_outlier_variants(std::vector> prob_matrix, std::vector filtered_frequencies, std::vector &outlier_idxs){ - /* - * Here we take in prob of every variants being assigned to every gaussian and determine which variants can be concretely assigned and which cannot. - */ - double threshold = 3; - //iterate the transposed version - for(uint32_t i=0; i < prob_matrix[0].size(); i++) { - std::vector tmp; - for(uint32_t j=0; j < prob_matrix.size(); j++){ - tmp.push_back(exp(prob_matrix[j][i])); - } - std::sort(tmp.begin(),tmp.end(), std::greater()); - double ratio = tmp[0] / tmp[1]; - if (tmp[1] * threshold > tmp[0]){ - outlier_idxs.push_back(i); - std::cerr << ratio << " " << filtered_frequencies[i] << std::endl; +void determine_outlier_variants(std::vector> clusters, std::vector &outlier_idxs, std::vector means){ + + //look at spread of variants assigned to each cluster to determine outliers + /*for(uint32_t i=0; i < clusters.size(); i++){ + for(uint32_t j=0; j < clusters[i].size(); j++){ + std::cerr << clusters[i][j] << " "; } - } + std::cerr << std::endl; + auto const Q1 = clusters[i].size() / 4; + auto const Q2 = clusters[i].size() / 2; + auto const Q3 = Q1 + Q2; + + std::nth_element(clusters[i].begin(), clusters[i].begin() + Q1, clusters[i].end()); + std::nth_element(clusters[i].begin() + Q1 + 1, clusters[i].begin() + Q2, clusters[i].end()); + std::nth_element(clusters[i].begin() + Q2 + 1, clusters[i].begin() + Q3, clusters[i].end()); + std::cerr << Q1 << " " << clusters[i][Q1] << std::endl; + std::cerr << Q3 << " " << clusters[i][Q3] << std::endl; + double sum = std::accumulate(clusters[i].begin(), clusters[i].end(), 0.0); + double mean = sum / clusters[i].size(); + double sq_sum = std::inner_product(clusters[i].begin(), clusters[i].end(), clusters[i].begin(), 0.0); + double stdev = std::sqrt(sq_sum / clusters[i].size() - mean * mean); + + double upper_bound = (stdev*3) + mean; + double lower_bound = mean - (stdev*3); + std::cerr << "lower " << lower_bound << " upper " << upper_bound << std::endl; + std::cerr << "mean " << mean << " std dev " << stdev << std::endl; + }*/ + } void split(const std::string &s, char delim, std::vector &elems){ @@ -103,20 +225,24 @@ int gmm_model(std::string prefix){ float upper_bound = 0.97; std::string filename = prefix + ".txt"; + std::vector frequencies; std::vector positions; std::vector depths; std::vector nucs; std::vector flagged; parse_internal_variants(filename, frequencies, positions, depths, flagged, nucs); + //the n min and n max represent the range of n values - uint32_t n = 2; + uint32_t n = 3; //filter out the data we won't use in our initial model std::vector filtered_frequencies; + std::vector filtered_positions; for(uint32_t i=0; i < frequencies.size(); i++){ if(depths[i] > 10 && flagged[i] == "FALSE" && frequencies[i] > lower_bound && frequencies[i] < upper_bound){ filtered_frequencies.push_back(frequencies[i]); + filtered_positions.push_back(positions[i]); } } @@ -135,10 +261,15 @@ int gmm_model(std::string prefix){ //get the means of the gaussians std::vector means; + //TEST LINES + for(uint32_t i=0; i < filtered_frequencies.size(); i++){ + std::cerr << filtered_frequencies[i] << " " << filtered_positions[i] << " " << model.log_p(data.col(i), 0) << " " << model.log_p(data.col(i), 1) << " " << model.log_p(data.col(i), 2) << std::endl; + } + //get the probability of each frequency being assigned to each gaussian std::vector> prob_matrix; for(uint32_t i=0; i < n; i++){ - means.push_back((double)model.means[i]); + //means.push_back((double)model.means[i]); arma::rowvec set_likelihood = model.log_p(data.cols(0,filtered_frequencies.size()-1), i); std::vector tmp; for(uint32_t j=0; j < filtered_frequencies.size(); j++){ @@ -146,14 +277,23 @@ int gmm_model(std::string prefix){ } prob_matrix.push_back(tmp); } - + + //assign variants out based on probability, not taking into account condition of all variants for a pos ~= 1 + std::vector> clusters = assign_variants_simple(filtered_frequencies, filtered_positions, prob_matrix); + + //calculate cluster means + for(uint32_t i=0; i < clusters.size(); i++){ + double m = accumulate(clusters[i].begin(), clusters[i].end(), 0.0) / clusters[i].size(); + std::cerr << m << std::endl; + means.push_back(m); + } //based on the probabilities, determine which points cannot be concretely assigned std::vector outlier_idxs; - double raw_distance_threshold = 0.20; //here we define any point existing too far outside of a cluster as not possible - determine_outlier_variants(prob_matrix, filtered_frequencies, outlier_idxs, raw_distance_threshold); - + determine_outlier_variants(clusters, outlier_idxs, means); + exit(1); + //solve for solutions adding to ~1 - solve_solution_sets(means, n); + //solve_solution_sets(means, n); //places where we'll call an N value //std::vector unassigned_points; diff --git a/src/gmm.h b/src/gmm.h index e50faa0f..0ad39bde 100644 --- a/src/gmm.h +++ b/src/gmm.h @@ -1,5 +1,14 @@ #ifndef gmm #define gmm +struct variant { + uint32_t position; + std::string nuc; + uint32_t depth; + float qual; + float freq; + bool vague_assignment; +} + int gmm_model(std::string prefix); #endif