Skip to content

Commit

Permalink
BayesTyper (v1.5 beta)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonas Andreas Sibbesen authored and Jonas Andreas Sibbesen committed Mar 13, 2019
1 parent 8b1436c commit 62888d6
Show file tree
Hide file tree
Showing 33 changed files with 517 additions and 449 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ SET(BUILD_STATIC 0 CACHE BOOL "Build static")

if(${BUILD_STATIC} EQUAL 0)

SET(CMAKE_CXX_FLAGS "--std=c++11 -lpthread -g -O3 -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.4.1 ${GIT_LAST_COMMIT_HASH}\"'")
SET(CMAKE_CXX_FLAGS "--std=c++11 -lpthread -g -O3 -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.5 beta ${GIT_LAST_COMMIT_HASH}\"'")

else(${BUILD_STATIC} EQUAL 0)

message(STATUS "Building BayesTyper static")

SET(CMAKE_CXX_FLAGS "--std=c++11 -pthread -O3 -static -static-libgcc -static-libstdc++ -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.4.1\"'")
SET(CMAKE_CXX_FLAGS "--std=c++11 -pthread -O3 -static -static-libgcc -static-libstdc++ -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.5 beta\"'")
SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive")

SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
Expand Down
8 changes: 1 addition & 7 deletions include/bayesTyper/Filters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,15 @@ class Filters {

public:

Filters(const OptionsContainer & options_container, const vector<vector<NegativeBinomialDistribution> > &, const ushort);
Filters(const OptionsContainer & options_container, const vector<vector<NegativeBinomialDistribution> > &);

ushort minFilterSamples() const;

ushort minHomozygoteGenotypes() const;
float minGenotypePosterior() const;
float minNumberOfKmers() const;

float minFractionObservedKmers(const ushort) const;

private:

ushort min_filter_samples;

ushort min_homozygote_genotypes;
float min_genotype_posterior;
float min_number_of_kmers;

Expand Down
32 changes: 18 additions & 14 deletions include/bayesTyper/FrequencyDistribution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,24 @@ class FrequencyDistribution {

public:

FrequencyDistribution(const ushort, const uint);
FrequencyDistribution(const uint, const uint);
virtual ~FrequencyDistribution() {};

virtual void reset();
virtual void initialize(const vector<uint> &);
virtual void setSparsity(const double);

pair<bool, double> getElementFrequency(const ushort);
ushort getNumElements();
pair<bool, double> getElementFrequency(const uint);
uint getNumElements();

virtual ~FrequencyDistribution() {};
virtual void incrementObservationCount(const ushort);
virtual void incrementObservationCount(const uint);
virtual void sampleFrequencies(const uint);

protected:

static const double dirichlet_parameter;
const uint num_elements;

vector<ushort> observation_counts;
vector<uint> observation_counts;
vector<double> frequencies;
vector<bool> non_zero_frequencies;

Expand All @@ -69,23 +71,25 @@ class SparseFrequencyDistribution : public FrequencyDistribution {

public:

SparseFrequencyDistribution(const ushort, const uint, const double);
SparseFrequencyDistribution(const double, const uint, const uint);

void reset();
void initialize(const vector<uint> &);
void setSparsity(const double);

void incrementObservationCount(const ushort);
void incrementObservationCount(const uint);
void sampleFrequencies(const uint);

private:

void updateCachedSimplexProbVector(vector<double> *, const uint, const ushort);
void updateCachedSimplexProbVector(vector<double> *, const uint, const uint);

const double sparsity;
double sparsity;

unordered_map<uint, unordered_map<ushort, vector<double> > > cached_simplex_prob_vectors;
unordered_map<uint, unordered_map<uint, vector<double> > > cached_simplex_prob_vectors;

unordered_set<ushort> plus_count_indices;
unordered_set<ushort> zero_count_indices;
unordered_set<uint> plus_count_indices;
unordered_set<uint> zero_count_indices;

uniform_int_distribution<> uniform_int_dist;
};
Expand Down
2 changes: 1 addition & 1 deletion include/bayesTyper/GenotypeWriter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class GenotypeWriter {
void writeAlleleField(const vector<ValueType> &);

void writeAlleleSequences(const VariantInfo &, const string &);
void writeQualityAndFilter(const Genotypes::VariantStats &, const ushort, const Filters &);
void writeQualityAndFilter(const Genotypes::VariantStats &, const Filters &);
void writeVariantStats(const Genotypes::VariantStats &, const ushort);
void writeAlleleCover(vector<ushort> *, const ushort);
void writeAlleleOrigin(const VariantInfo &);
Expand Down
6 changes: 2 additions & 4 deletions include/bayesTyper/Genotypes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ class Genotypes {
ushort num_candidates;
vector<ushort> non_covered_alleles;

ushort num_homozygote_genotypes;

struct VariantStats {

uint total_count;
Expand All @@ -83,7 +81,7 @@ class Genotypes {
struct SampleStats {

vector<ushort> genotype_estimate;
bool is_homozygote;
uint genotype_quality;

vector<float> genotype_posteriors;

Expand All @@ -92,7 +90,7 @@ class Genotypes {

vector<ushort> allele_filters;

SampleStats(const ushort num_alleles, const uint num_genotypes) : is_homozygote(false), genotype_posteriors(num_genotypes, 0), allele_posteriors(num_alleles, 0), allele_filters(num_alleles, 0) {
SampleStats(const ushort num_alleles, const uint num_genotypes) : genotype_quality(0), genotype_posteriors(num_genotypes, 0), allele_posteriors(num_alleles, 0), allele_filters(num_alleles, 0) {

genotype_estimate.reserve(2);
}
Expand Down
24 changes: 14 additions & 10 deletions include/bayesTyper/HaplotypeFrequencyDistribution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,10 @@ class HaplotypeFrequencyDistribution {
uint numMissingCount();

virtual void reset() = 0;
virtual void initialize(const vector<uint> &, const uint) = 0;

virtual pair<bool, double> getElementFrequency(const ushort) = 0;
virtual void incrementObservationCount(const ushort) = 0;
virtual pair<bool, double> getFrequency(const ushort) = 0;
virtual void incrementCount(const ushort) = 0;

virtual void sampleFrequencies() = 0;

Expand All @@ -63,10 +64,11 @@ class UniformHaplotypeFrequencyDistribution : public HaplotypeFrequencyDistribut
UniformHaplotypeFrequencyDistribution(const ushort);
~UniformHaplotypeFrequencyDistribution() {};

void reset();
void reset() {};
void initialize(const vector<uint> &, const uint) {};

pair<bool, double> getElementFrequency(const ushort);
void incrementObservationCount(const ushort);
pair<bool, double> getFrequency(const ushort);
void incrementCount(const ushort);

void sampleFrequencies();

Expand All @@ -79,19 +81,21 @@ class SparseHaplotypeFrequencyDistribution : public HaplotypeFrequencyDistributi

public:

SparseHaplotypeFrequencyDistribution(FrequencyDistribution *);
SparseHaplotypeFrequencyDistribution(const vector<uint> &, const uint, const uint);
~SparseHaplotypeFrequencyDistribution();

void reset();
void initialize(const vector<uint> &, const uint);

pair<bool, double> getElementFrequency(const ushort);
void incrementObservationCount(const ushort);
pair<bool, double> getFrequency(const ushort);
void incrementCount(const ushort);

void sampleFrequencies();

private:
protected:

FrequencyDistribution * frequency_distribution;
};

#endif
#endif

7 changes: 4 additions & 3 deletions include/bayesTyper/InferenceEngine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class InferenceEngine {
const ushort num_gibbs_chains;
const float kmer_subsampling_rate;
const uint max_haplotype_variant_kmers;
const bool min_cover_haplotype_init;

struct VariantClusterGroupBatch {

Expand All @@ -87,9 +88,9 @@ class InferenceEngine {
VariantClusterGroupBatch(const uint first_variant_cluster_group_idx_in, const uint num_variants_in, const vector<VariantClusterGroup*>::iterator start_it_in, const vector<VariantClusterGroup*>::iterator end_it_in) : first_variant_cluster_group_idx(first_variant_cluster_group_idx_in), num_variants(num_variants_in), start_it(start_it_in), end_it(end_it_in) {}
};

void initNoiseEstimationGroupsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, KmerCountsHash *, const ushort, const ushort);
void sampleNoiseCountsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, CountAllocation *, mutex *, const CountDistribution &, const ushort);
void resetNoiseEstimationGroups(vector<VariantClusterGroup *> *, const vector<uint> &);
void initNoiseEstimationGroupsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, KmerCountsHash *, const ushort, const ushort, const uint);
void sampleNoiseCountsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, CountAllocation *, mutex *, const CountDistribution &, const ushort, const uint);
void resetNoiseEstimationGroupsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, const ushort, const uint);

void genotypeVariantClusterGroupsCallback(ProducerConsumerQueue<VariantClusterGroupBatch> *, KmerCountsHash *, const CountDistribution &, const Filters & filters, GenotypeWriter *, uint *, mutex *);
};
Expand Down
3 changes: 2 additions & 1 deletion include/bayesTyper/SparsityEstimator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ class SparsityEstimator {

public:

SparsityEstimator() {};
SparsityEstimator(const uint);

unordered_set<ushort> estimateMinimumColumnCover(const Utils::MatrixXuchar &, Utils::RowVectorXbool *);
vector<uint> estimateMinimumColumnCover(const Utils::MatrixXuchar &, const Utils::RowVectorXbool &, const bool);

private:

Expand Down
10 changes: 7 additions & 3 deletions include/bayesTyper/VariantClusterGenotyper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ THE SOFTWARE.
#include "Sample.hpp"
#include "KmerStats.hpp"
#include "Filters.hpp"
#include "SparsityEstimator.hpp"


using namespace std;
Expand All @@ -68,7 +69,7 @@ class VariantClusterGenotyper {
VariantClusterGenotyper(VariantClusterGraph *, KmerCountsHash *, const vector<Sample> &, const uint, const uchar);
~VariantClusterGenotyper();

void reset(const float, const uint);
void reset(const float, const uint, const bool);

void sampleDiplotypes(const CountDistribution &, const vector<VariantClusterHaplotypes::NestedVariantClusterInfo> &, const bool);
void getNoiseCounts(CountAllocation *, const CountDistribution &);
Expand All @@ -85,7 +86,7 @@ class VariantClusterGenotyper {
void updateMulticlusterDiplotypeLogProb(const CountDistribution &, const ushort);
double calcDiplotypeLogProb(const CountDistribution &, const ushort, const pair<ushort, ushort> &);

void sampleDiplotype(const CountDistribution &, const ushort, const Utils::Ploidy);
void sampleDiplotype(const vector<ushort> &, const CountDistribution &, const ushort, const Utils::Ploidy);

ushort haplotypeToAlleleIndex(const ushort, const ushort);
vector<ushort> getNonCoveredAlleles(const ushort);
Expand All @@ -99,7 +100,10 @@ class VariantClusterGenotyper {
bool use_multicluster_kmers;

VariantClusterHaplotypes variant_cluster_haplotypes;


SparsityEstimator sparsity_estimator;
Utils::RowVectorXbool non_zero_kmer_counts;

vector<VariantInfo> variant_cluster_info;
vector<vector<AlleleKmerStats> > allele_kmer_stats;

Expand Down
6 changes: 3 additions & 3 deletions include/bayesTyper/VariantClusterGraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class VariantClusterGraph {
virtual ~VariantClusterGraph() {};

const vector<VariantInfo> & getInfo() const;
bool isSimpleCluster();
bool hasExcludedKmers();

void findSamplePaths(KmerBloom<Utils::kmer_size> *, const uint, const ushort);

Expand All @@ -92,7 +92,7 @@ class VariantClusterGraph {
Graph graph;

ulong num_path_kmers;
bool is_simple_cluster;
bool has_excluded_kmers;

vector<VariantInfo> variant_cluster_info;
vector<bool> best_paths_indices;
Expand All @@ -114,7 +114,7 @@ class VariantClusterGraph {

ar & graph;
ar & num_path_kmers;
ar & is_simple_cluster;
ar & has_excluded_kmers;
ar & variant_cluster_info;
ar & best_paths_indices;
}
Expand Down
13 changes: 5 additions & 8 deletions include/bayesTyper/VariantClusterGroup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ class VariantClusterGroup {

uint num_variants;

bool is_parameter_cluster;

vector<VariantClusterVertex> vertices;
vector<vector<uint> > out_edges;

Expand All @@ -98,7 +96,6 @@ class VariantClusterGroup {
ar & start_position;
ar & end_position;
ar & num_variants;
ar & is_parameter_cluster;
ar & vertices;
ar & out_edges;
ar & source_vertices;
Expand All @@ -112,22 +109,22 @@ class VariantClusterGroup {
VariantClusterGroup(const vector<VariantCluster *> &, const vector<VariantClusterGraph *> &, const unordered_map<uint, uint> &);
~VariantClusterGroup();

double getComplexity() const;

uint numberOfVariants() const;
uint numberOfVariantClusters() const;
uint numberOfVariantClusterGroupTrees() const;

string region() const;

void findSamplePaths(KmerBloom<Utils::kmer_size> *, const uint, const ushort);

void countPathKmers(unordered_set<bitset<Utils::kmer_size * 2> > *);
void classifyPathKmers(KmerCountsHash *, KmerBloom<Utils::kmer_size> *);

bool isSimpleParameterCluster() const;
bool hasExcludedKmers() const;

void initGenotyper(KmerCountsHash *, const vector<Sample> &, const uint, const uchar, const float, const uint);
void initGenotyper(KmerCountsHash *, const vector<Sample> &, const uint, const uchar, const float, const uint, const bool);

void resetGenotyper(const float kmer_subsampling_rate, const uint max_haplotype_variant_kmers);
void resetGenotyper(const float, const uint, const bool);
void resetGroup();

void shuffleBranchOrdering(const uint);
Expand Down
2 changes: 1 addition & 1 deletion include/bayesTyperTools/Filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace Filter {
float min_gpp_value;
};

void filter(const string &, const string &, const string &, FilterValues, const uint);
void filter(const string &, const string &, const string &, FilterValues);
}

#endif
1 change: 1 addition & 0 deletions include/vcf++/Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ namespace Utils {
static const uint uint_overflow = numeric_limits<uint>::max();
static const ulong ulong_overflow = numeric_limits<ulong>::max();

static const ushort queue_size_thread_scaling = 2;

inline bool doubleCompare(const double a, const double b) {

Expand Down
9 changes: 4 additions & 5 deletions src/bayesTyper/CountDistribution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<

cerr << "\nERROR: Insufficient number of kmers used for negative binomial parameters estimation for sample " << samples.at(sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count << "); the genome used is likely too small and/or too repetitive\n" << endl;
exit(1);

} else if (max_kmer_count < (min_nb_kmer_count * 10)) {

cout << "\nWARNING: Low number of kmers used for negative binomial parameters estimation for sample " << samples.at(sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count * 10 << "); mean and variance estimate might be biased\n" << endl;
}

assert(max_kmer_multiplicity > 0);
Expand All @@ -128,11 +132,6 @@ void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<

cout << "[" << Utils::getLocalTime() << "] Estimated negative binomial (mean = " << nb_mean << ", var = " << nb_var << ") for sample " << samples.at(sample_idx).name << " using " << max_kmer_count << " parameter kmers (ploidy = " << max_kmer_multiplicity << ")" << endl;

if (max_kmer_count < (min_nb_kmer_count * 10)) {

cout << "\nWARNING: Low number of kmers used for negative binomial parameters estimation for sample " << samples.at(sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count * 10 << "); mean and variance estimate might be biased\n" << endl;
}

genomic_outfile << samples.at(sample_idx).name << "\t" << nb_mean << "\t" << nb_var << endl;
}
}
Expand Down
Loading

0 comments on commit 62888d6

Please sign in to comment.