diff --git a/LICENSE b/LICENSE
index 033593b..44f5f63 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,6 @@
 The MIT License (MIT)
 
+Copyright (c) 2018 Enno Roehrig
 Copyright (c) 2013,2014 Xinyu Zhou
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/src/gmm.cc b/src/gmm.cc
index dbf722f..d577476 100644
--- a/src/gmm.cc
+++ b/src/gmm.cc
@@ -19,6 +19,7 @@
 using namespace std;
 using namespace ThreadLib;
 
 static const real_t SQRT_2_PI = 2.5066282746310002;
+static const real_t PI_2 = 6.28318530718;  // 2 * pi
 
 #include "fastexp.hh"
@@ -29,8 +30,8 @@ static const real_t EPS = 2.2204460492503131e-16;
 
 Gaussian::Gaussian(int dim, int covariance_type) :
     dim(dim), covariance_type(covariance_type) {
-    if (covariance_type != COVTYPE_DIAGONAL) {
-        const char *msg = "only diagonal matrix supported.";
+    if (covariance_type != COVTYPE_DIAGONAL && covariance_type != COVTYPE_FULL) {
+        const char *msg = "covariance type not supported.";
         printf("%s\n", msg);
         throw msg;
     }
@@ -51,7 +52,7 @@ void Gaussian::sample(std::vector<real_t> &x) {
                 x[i] = random.rand_normal(mean[i], sigma[i]);
             break;
         case COVTYPE_FULL:
-            throw "COVTYPE_FULL not implemented";
+            throw "no sampling for COVTYPE_FULL implemented";
             break;
     }
 }
@@ -62,7 +63,7 @@ vector<real_t> Gaussian::sample() {
     return x;
 }
 
-real_t Gaussian::log_probability_of(std::vector<real_t> &x) {
+real_t Gaussian::log_probability_of(const std::vector<real_t> &x) {
     assert((int)x.size() == dim);
 
     real_t prob = 0;
@@ -79,7 +80,37 @@ real_t Gaussian::log_probability_of(std::vector<real_t> &x) {
         }
         break;
     case COVTYPE_FULL:
-        throw "COVTYPE_FULL not implemented";
+        std::vector<real_t> x_copy = x;
+        Eigen::VectorXd d = Eigen::Map<Eigen::VectorXd>(x_copy.data(), x_copy.size());
+        d = d-Eigen::Map<Eigen::VectorXd>(mean.data(), mean.size());
+        prob = -log(sqrt(std::pow(PI_2,x.size()) * det_covariance)) - 0.5f * d.transpose() * inv_covariance * d;
         break;
+    }
+    return prob;
+}
+
+real_t Gaussian::mahalanobis_of(const std::vector<real_t> &x) {
+    assert((int)x.size() == dim);
+    real_t prob = 0;
+    switch (covariance_type) {
+        case COVTYPE_SPHERICAL:
+            throw "COVTYPE_SPHERICAL not implemented";
+            break;
+        case COVTYPE_DIAGONAL:
+            // sum the squared standardized distances, take one sqrt at the end
+            for (int i = 0; i < dim; i ++) {
+                real_t &s = sigma[i];
+                real_t s2 = s * s;
+                real_t d = (x[i] - mean[i]);
+                prob += d * d / s2;
+            }
+            prob = sqrt(prob);
+            break;
+        case COVTYPE_FULL:
+            std::vector<real_t> x_copy = x;
+            Eigen::VectorXd d = Eigen::Map<Eigen::VectorXd>(x_copy.data(), x_copy.size());
+            d = d-Eigen::Map<Eigen::VectorXd>(mean.data(), mean.size());
+            prob = sqrt(d.transpose() * inv_covariance * d);
+            break;
     }
     return prob;
@@ -100,11 +129,10 @@ void Gaussian::dump(std::ostream &out) {
             out << endl;
             break;
         case COVTYPE_FULL:
-            for (auto &row: covariance) {
-                for (auto &v: row)
-                    out << v << ' ';
-                out << endl;
-            }
+            for(int x=0; x<dim; x++)
+                for(int y=0; y<dim; y++)
+                    out << covariance(x,y) << ' ';
+            out << endl;
             break;
     }
 }
@@ ... @@ void Gaussian::load(std::istream &in) {
                 in >> s;
             break;
         case COVTYPE_FULL:
-            covariance.resize(dim);
-            for (auto &row: covariance) {
-                row.resize(dim);
-                for (auto &v: row)
-                    in >> v;
-            }
+            covariance.resize(dim,dim);
+            for(int x=0; x<dim; x++)
+                for(int y=0; y<dim; y++)
+                    in >> covariance(x,y);
+            setCovariance(covariance);
             break;
     }
 }
 
+void Gaussian::setCovariance(const Eigen::MatrixXd& covariance){
+    this->covariance = covariance;
+    det_covariance = covariance.determinant();  // cached for the density normalizer
+    inv_covariance = covariance.inverse();      // cached for the Mahalanobis terms
+}
+
 // most time consuming function
-real_t Gaussian::probability_of(std::vector<real_t> &x) {
+real_t Gaussian::probability_of(const std::vector<real_t> &x) {
     assert((int)x.size() == dim);
 
     real_t prob = 1.0;
@@ -153,13 +186,16 @@ real_t Gaussian::probability_of(std::vector<real_t> &x) {
         }
         break;
     case COVTYPE_FULL:
-        throw "COVTYPE_FULL not implemented";
+        std::vector<real_t> x_copy = x;
+        Eigen::VectorXd d = Eigen::Map<Eigen::VectorXd>(x_copy.data(), x_copy.size());
+        d = d-Eigen::Map<Eigen::VectorXd>(mean.data(), mean.size());
+        prob = exp(-0.5f * d.transpose() * inv_covariance * d) / sqrt(std::pow(PI_2,x.size()) * det_covariance);
         break;
     }
     return prob;
 }
 
-real_t Gaussian::probability_of_fast_exp(std::vector<real_t> &x, double *buffer) {
+real_t Gaussian::probability_of_fast_exp(const std::vector<real_t> &x, double *buffer) {
     assert((int)x.size() == dim);
 
     real_t prob = 1.0;
@@ -181,7 +217,7 @@ real_t Gaussian::probability_of_fast_exp(std::vector<real_t> &x, double *buffer)
         }
         break;
     case COVTYPE_FULL:
-        throw "COVTYPE_FULL not implemented";
+        prob = probability_of(x);  // no fast-exp path for full covariance yet
        break;
     }
     return prob;
@@ -194,8 +230,8 @@ GMM::GMM(int nr_mixtures, int covariance_type,
     covariance_type(covariance_type),
     trainer(trainer) {
 
-    if (covariance_type != COVTYPE_DIAGONAL) {
-        const char *msg = "only diagonal matrix supported.";
+    if (covariance_type != COVTYPE_DIAGONAL && covariance_type != COVTYPE_FULL) {
+        const char *msg = "covariance type not supported.";
         printf("%s\n", msg);
         throw msg;
     }
@@ -212,7 +248,7 @@ GMM::~GMM() {
 }
 
-real_t GMM::log_probability_of(std::vector<real_t> &x) {
+real_t GMM::log_probability_of(const std::vector<real_t> &x) {
     real_t prob = 0;
     for (int i = 0; i < nr_mixtures; i ++) {
         prob += weights[i] * gaussians[i]->probability_of(x);
@@ -220,7 +256,7 @@ real_t GMM::log_probability_of(std::vector<real_t> &x) {
     return log(prob);
 }
 
-real_t GMM::log_probability_of_fast_exp(std::vector<real_t> &x, double *buffer) {
+real_t GMM::log_probability_of_fast_exp(const std::vector<real_t> &x, double *buffer) {
     real_t prob = 0;
 
     for (int i = 0; i < nr_mixtures; i ++) {
@@ -229,29 +265,34 @@ real_t GMM::log_probability_of_fast_exp(std::vector<real_t> &x, double *buffer)
     return log(prob);
 }
 
-real_t GMM::probability_of(std::vector<real_t> &x) {
+real_t GMM::probability_of(const std::vector<real_t> &x) {
     real_t prob = 0;
     for (int i = 0; i < nr_mixtures; i ++) {
-        prob *= weights[i] * gaussians[i]->probability_of(x);
+        prob += weights[i] * gaussians[i]->probability_of(x);
     }
     return prob;
 }
 
 // time consuming
-real_t GMM::log_probability_of(std::vector<std::vector<real_t>> &X) {
+real_t GMM::log_probability_of(const std::vector<std::vector<real_t>> &X) {
     real_t prob = 0;
     for (auto &x: X)
         prob += log_probability_of(x);
     return prob;
 }
 
-real_t GMM::log_probability_of_fast_exp(std::vector<std::vector<real_t>> &X, double *buffer) {
+real_t GMM::log_probability_of_fast_exp(const std::vector<std::vector<real_t>> &X, double *buffer) {
     assert(buffer != NULL);
     real_t prob = 0;
     for (auto &x: X)
         prob += log_probability_of_fast_exp(x, buffer);
     return prob;
 }
 
+real_t GMM::mahalanobis_of(const std::vector<real_t> &x) {
+    if (nr_mixtures != 1)
+        throw "Mahalanobis Distance not implemented for more than one gaussian";
+    return gaussians[0]->mahalanobis_of(x);
+}
 #if 0
 static vector<real_t> random_vector(int dim, real_t range, Random &random) {
@@ -337,7 +378,7 @@ static void Dense2Sparse(const std::vector<std::vector<real_t>> &X,
     }
 }
 
-void GMMTrainerBaseline::init_gaussians(std::vector<std::vector<real_t>> &X) {
+void GMMTrainerBaseline::init_gaussians(const std::vector<std::vector<real_t>> &X) {
     assert(gmm->covariance_type == COVTYPE_DIAGONAL);
 
     // calculate data variance
@@ -384,7 +425,20 @@ void GMMTrainerBaseline::init_gaussians(std::vector<std::vector<real_t>> &X) {
     }
 
     for (auto &g: gmm->gaussians) {
-        g->sigma = initial_sigma;
+        switch(gmm->covariance_type){
+            case COVTYPE_SPHERICAL:
+                throw "COVTYPE_SPHERICAL not implemented";
+                break;
+            case COVTYPE_DIAGONAL:
+                g->sigma = initial_sigma;
+                break;
+            case COVTYPE_FULL:
+                Eigen::MatrixXd covariance = Eigen::MatrixXd::Identity(g->dim, g->dim);
+                for(unsigned i =0; i<initial_sigma.size(); i++)
+                    covariance(i,i) = initial_sigma[i] * initial_sigma[i];
+                g->setCovariance(covariance);
+                break;
+        }
     }
 
     gmm->weights.resize(gmm->nr_mixtures);
@@ -413,13 +467,10 @@ static void gassian_set_zero(Gaussian *gaussian) {
         m = 0;
     for (auto &s: gaussian->sigma)
         s = 0;
-    for (auto &row: gaussian->covariance)
-        for (auto &v: row)
-            v = 0;
-
+    gaussian->setCovariance(Eigen::MatrixXd::Identity(gaussian->dim, gaussian->dim));
 }
 
-void GMMTrainerBaseline::iteration(std::vector<std::vector<real_t>> &X) {
+void GMMTrainerBaseline::iteration(const std::vector<std::vector<real_t>> &X) {
     int n = (int)X.size();
 
     bool enable_guarded_timer = verbosity >= 2;
@@ -513,32 +564,67 @@ void GMMTrainerBaseline::iteration(std::vector<std::vector<real_t>> &X) {
     {
         GuardedTimer timer("update sigma", enable_guarded_timer);
         {
-            real_t min_sigma = sqrt(min_covar);
-            Threadpool pool(concurrency);
-            for (int k = 0; k < gmm->nr_mixtures; k ++) {
-                auto task = [&](int k) {
-                    vector<real_t> tmp(dim);
-                    auto &gaussian = gmm->gaussians[k];
-                    for (int i = 0; i < n; i ++) {
-                        sub(X[i], gaussian->mean, tmp);
-                        for (auto &t: tmp) t = t * t;
-                        mult_self(tmp, prob_of_y_given_x[k][i]);
-                        add_self(gaussian->sigma, tmp);
-                    }
-                    mult_self(gaussian->sigma, 1.0 / N_k[k]);
-                    for (auto &s: gaussian->sigma) {
-                        s = sqrt(s);
-                        s = max(min_sigma, s);
-                    }
-                };
-                pool.enqueue(bind(task, k), 1);
-            }
+            switch (gmm->covariance_type) {
+                case COVTYPE_SPHERICAL:
+                    throw "COVTYPE_SPHERICAL not implemented";
+                    break;
+                case COVTYPE_DIAGONAL: {
+                    real_t min_sigma = sqrt(min_covar);
+                    Threadpool pool(concurrency);
+                    for (int k = 0; k < gmm->nr_mixtures; k ++) {
+                        auto task = [&](int k) {
+                            vector<real_t> tmp(dim);
+                            auto &gaussian = gmm->gaussians[k];
+                            for (int i = 0; i < n; i ++) {
+                                sub(X[i], gaussian->mean, tmp);
+                                for (auto &t: tmp) t = t * t;
+                                mult_self(tmp, prob_of_y_given_x[k][i]);
+                                add_self(gaussian->sigma, tmp);
+                            }
+                            mult_self(gaussian->sigma, 1.0 / N_k[k]);
+                            for (auto &s: gaussian->sigma) {
+                                s = sqrt(s);
+                                s = max(min_sigma, s);
+                            }
+                        };
+                        pool.enqueue(bind(task, k), 1);
+                    }
+                }
+                break;
+                case COVTYPE_FULL: {
+                    Threadpool pool(concurrency);
+                    for (int k = 0; k < gmm->nr_mixtures; k ++) {
+                        auto task = [&](int k) {
+                            vector<real_t> tmp(dim);
+                            // per-task accumulator, so concurrent tasks do not share state
+                            vector<real_t> cov(dim*dim);
+                            auto &gaussian = gmm->gaussians[k];
+                            for (int datapoint = 0; datapoint < n; datapoint ++) {
+                                sub(X[datapoint], gaussian->mean, tmp);
+                                vector<real_t> tmp2;
+                                for(int i=0; i<gaussian->dim; i++)
+                                    for(int j=0; j<gaussian->dim; j++)
+                                        tmp2.push_back(tmp[i] * tmp[j]);
+                                mult_self(tmp2, prob_of_y_given_x[k][datapoint]);
+                                add_self(cov, tmp2);
+                            }
+                            mult_self(cov, 1.0 / N_k[k]);
+                            // the diagonal holds variances, so clamp with min_covar itself
+                            for(int i=0; i<gaussian->dim; i++)
+                                cov[i*(gaussian->dim+1)] = max(cov[i*(gaussian->dim+1)], min_covar);
+                            gaussian->setCovariance(Eigen::Map<Eigen::MatrixXd>(cov.data(), gaussian->dim, gaussian->dim));
+                        };
+                        pool.enqueue(bind(task, k), 1);
+                    }
+                }
+                break;
+            }
         }
     }
 }
 
-static void threaded_log_probability_of(GMM *gmm, std::vector<std::vector<real_t>> &X, std::vector<real_t> &prob_buffer, int concurrency) {
+static void threaded_log_probability_of(GMM *gmm, const std::vector<std::vector<real_t>> &X, std::vector<real_t> &prob_buffer, int concurrency) {
     int n = (int)X.size();
     prob_buffer.resize(n);
     int batch_size = (int)ceil(n / (real_t)concurrency);
@@ -567,7 +653,7 @@ static void threaded_log_probability_of(GMM *gmm, std::vector<std::vector<real_t>> &X,
 }
 
-static real_t threaded_log_probability_of(GMM *gmm, std::vector<std::vector<real_t>> &X, int concurrency) {
+static real_t threaded_log_probability_of(GMM *gmm, const std::vector<std::vector<real_t>> &X, int concurrency) {
     std::vector<real_t> prob_buffer;
     threaded_log_probability_of(gmm, X, prob_buffer, concurrency);
     real_t prob = 0;
@@ -576,17 +662,17 @@
     return prob;
 }
 
-real_t GMM::log_probability_of_fast_exp_threaded(std::vector<std::vector<real_t>> &X, int concurrency) {
+real_t GMM::log_probability_of_fast_exp_threaded(const std::vector<std::vector<real_t>> &X, int concurrency) {
     return threaded_log_probability_of(this, X, concurrency);
 }
 
 void GMM::log_probability_of_fast_exp_threaded(
-        std::vector<std::vector<real_t>> &X, std::vector<real_t> &prob_out, int concurrency) {
+        const std::vector<std::vector<real_t>> &X, std::vector<real_t> &prob_out, int concurrency) {
     threaded_log_probability_of(this, X, prob_out, concurrency);
 }
 
-void GMMTrainerBaseline::train(GMM *gmm, std::vector<std::vector<real_t>> &X) {
+void GMMTrainerBaseline::train(GMM *gmm, const std::vector<std::vector<real_t>> &X) {
     if (X.size() == 0) {
         const char *msg = "X.size() == 0";
         printf("%s\n", msg);
@@ -652,7 +738,7 @@ void GMMTrainerBaseline::train(GMM *gmm, std::vector<std::vector<real_t>> &X) {
 }
 
 void GMM::dump(ostream &out) {
-    out << nr_mixtures << endl;
+    out << nr_mixtures << ' ' << dim << ' ' << covariance_type << endl;
     for (auto &w: weights)
         out << w << ' ';
     out << endl;
@@ -661,13 +747,13 @@ void GMM::dump(ostream &out) {
 }
 
 void GMM::load(istream &in) {
-    in >> nr_mixtures;
+    in >> nr_mixtures >> dim >> covariance_type;
     weights.resize(nr_mixtures);
     for (auto &w: weights)
         in >> w;
 
     gaussians.resize(nr_mixtures);
     for (auto &g: gaussians) {
-        g = new Gaussian(dim, COVTYPE_DIAGONAL);
+        g = new Gaussian(dim, covariance_type);
         g->load(in);
     }
 }
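Note on the full-covariance math above: every COVTYPE_FULL branch leans on the determinant and inverse cached by Gaussian::setCovariance(), evaluating log N(x; mu, Sigma) = -0.5 * log((2*pi)^d * det(Sigma)) - 0.5 * (x - mu)^T Sigma^-1 (x - mu), while mahalanobis_of() returns the square root of the quadratic form alone, and the "update sigma" hunk accumulates the responsibility-weighted outer products sum_i gamma_ik (x_i - mu_k)(x_i - mu_k)^T / N_k in a flat buffer before Mapping it back into Eigen. A minimal standalone sketch of the same arithmetic (a hypothetical test file, not part of this patch; it assumes Eigen is on the include path and that real_t is double):

    // full_cov_check.cc -- standalone sketch mirroring the cached det/inverse
    // usage of Gaussian::setCovariance and the COVTYPE_FULL log-density.
    #include <Eigen/Dense>
    #include <cmath>
    #include <iostream>

    int main() {
        const double TWO_PI = 6.283185307179586;  // the patch names this PI_2

        Eigen::Vector2d mean(1.0, 2.0);
        Eigen::Matrix2d cov;
        cov << 2.0, 0.3,
               0.3, 1.0;

        // cached once per covariance update, as in setCovariance()
        const double det_cov = cov.determinant();
        const Eigen::Matrix2d inv_cov = cov.inverse();

        Eigen::Vector2d x(1.5, 1.0);
        Eigen::Vector2d d = x - mean;

        // squared Mahalanobis distance (x - mu)^T Sigma^-1 (x - mu)
        const double maha2 = d.dot(inv_cov * d);
        const double log_p =
            -0.5 * (std::log(std::pow(TWO_PI, 2.0) * det_cov) + maha2);

        std::cout << "mahalanobis = " << std::sqrt(maha2) << "\n"
                  << "log N(x)    = " << log_p << "\n";
        return 0;
    }

Caching det_covariance and inv_covariance once per update is what keeps probability_of(), still "the most time consuming function", from paying an O(d^3) decomposition per sample.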
diff --git a/src/gmm.hh b/src/gmm.hh
index 7205cfc..1495537 100644
--- a/src/gmm.hh
+++ b/src/gmm.hh
@@ -13,6 +13,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <Eigen/Dense>
 
 enum CovType {
@@ -28,11 +29,14 @@ class Gaussian {
         int dim, covariance_type;
         std::vector<real_t> mean;
         std::vector<real_t> sigma;
-        std::vector<std::vector<real_t>> covariance;  // not used
-        real_t log_probability_of(std::vector<real_t> &x);
-        real_t probability_of(std::vector<real_t> &x);
-        real_t probability_of_fast_exp(std::vector<real_t> &x, double *buffer = NULL);
+
+        real_t log_probability_of(const std::vector<real_t> &x);
+        real_t probability_of(const std::vector<real_t> &x);
+        real_t probability_of_fast_exp(const std::vector<real_t> &x, double *buffer = NULL);
+        real_t mahalanobis_of(const std::vector<real_t> &x);
+
+        void setCovariance(const Eigen::MatrixXd& covariance);
 
         // sample a point to @x
         void sample(std::vector<real_t> &x);
@@ -43,12 +47,17 @@ class Gaussian {
 
         Random random;
 
         int fast_gaussian_dim;
+
+    private:
+        Eigen::MatrixXd covariance;
+        double det_covariance;
+        Eigen::MatrixXd inv_covariance;
 };
 
 class GMM;
 class GMMTrainer {
     public:
-        virtual void train(GMM *gmm, std::vector<std::vector<real_t>> &X) = 0;
+        virtual void train(GMM *gmm, const std::vector<std::vector<real_t>> &X) = 0;
         virtual ~GMMTrainer() {}
 };
@@ -57,13 +66,13 @@ class GMMTrainerBaseline : public GMMTrainer {
         GMMTrainerBaseline(int nr_iter = 10, real_t min_covar = 1e-3,
                 real_t threshold = 0.01, int init_with_kmeans = 1,
                 int concurrency = 1, int verbosity = 0);
-        virtual void train(GMM *gmm, std::vector<std::vector<real_t>> &X);
+        virtual void train(GMM *gmm, const std::vector<std::vector<real_t>> &X);
         void clear_gaussians();
 
         // init gaussians along with its weight
-        virtual void init_gaussians(std::vector<std::vector<real_t>> &X);
+        virtual void init_gaussians(const std::vector<std::vector<real_t>> &X);
 
-        virtual void iteration(std::vector<std::vector<real_t>> &X);
+        virtual void iteration(const std::vector<std::vector<real_t>> &X);
 
         int dim;
@@ -91,7 +100,7 @@ class GMM {
         ~GMM();
 
         template <class Instance>
-        void fit(std::vector<Instance> &X) {
+        void fit(const std::vector<Instance> &X) {
             bool new_trainer = false;
             if (trainer == NULL) {
                 trainer = new GMMTrainerBaseline();
@@ -114,17 +123,18 @@ class GMM {
         std::vector<real_t> weights;
         std::vector<Gaussian *> gaussians;
 
-        real_t log_probability_of(std::vector<real_t> &x);
-        real_t log_probability_of(std::vector<std::vector<real_t>> &X);
+        real_t log_probability_of(const std::vector<real_t> &x);
+        real_t log_probability_of(const std::vector<std::vector<real_t>> &X);
 
-        real_t log_probability_of_fast_exp(std::vector<real_t> &x, double *buffer = NULL);
-        real_t log_probability_of_fast_exp(std::vector<std::vector<real_t>> &X, double *buffer = NULL);
-        real_t log_probability_of_fast_exp_threaded(std::vector<std::vector<real_t>> &X, int concurrency);
-        void log_probability_of_fast_exp_threaded(
+        real_t log_probability_of_fast_exp(const std::vector<real_t> &x, double *buffer = NULL);
+        real_t log_probability_of_fast_exp(const std::vector<std::vector<real_t>> &X, double *buffer = NULL);
+        real_t log_probability_of_fast_exp_threaded(const std::vector<std::vector<real_t>> &X, int concurrency);
+        real_t mahalanobis_of(const std::vector<real_t> &x);
+        void log_probability_of_fast_exp_threaded(const
                 std::vector<std::vector<real_t>> &X, std::vector<real_t> &prob_out, int concurrency);
 
-        real_t probability_of(std::vector<real_t> &x);
+        real_t probability_of(const std::vector<real_t> &x);
 
         void normalize_weights();
diff --git a/src/kmeans++.cc b/src/kmeans++.cc
index 5b4a4b1..7e92845 100644
--- a/src/kmeans++.cc
+++ b/src/kmeans++.cc
@@ -107,7 +107,7 @@ real_t KMeansppSolver::cluster(const Dataset &dataset, std::vector<Vector> &centroids, int K) {
     for (auto &p: centroids)
         p.resize(m);
 
-    printf("kmeans++ initializing ...\n");
+    //printf("kmeans++ initializing ...\n");
 
     // initial cluster with one random point
     Instance2Vector(dataset[random.rand_int() % n], centroids[0], m);
@@ -145,7 +145,7 @@ real_t KMeansppSolver::cluster(const Dataset &dataset, std::vector<Vector> &centroids, int K) {
             random_weight -= distances[i];
             if (random_weight <= 0) {
                 Instance2Vector(dataset[i], centroids[k], m);
-                printf("kmeans++ iteration %3d: %f, #%d has been choosen\n", k, distsqr_sum, i); fflush(stdout);
+                //printf("kmeans++ iteration %3d: %f, #%d has been choosen\n", k, distsqr_sum, i); fflush(stdout);
                 break;
             }
         }
@@ -165,7 +165,7 @@ real_t KMeansppSolver::cluster_weighted(
     for (auto &p: centroids)
         p.resize(m);
 
-    printf("kmeans++ initializing ...\n");
+    //printf("kmeans++ initializing ...\n");
 
     // initial cluster with one random point
     Instance2Vector(dataset[random.rand_int() % n], centroids[0], m);
@@ -204,7 +204,7 @@ real_t KMeansppSolver::cluster_weighted(
             random_weight -= distances[i];
             if (random_weight <= 0) {
                 Instance2Vector(dataset[i], centroids[k], m);
-                printf("kmeans++ iteration %3d: %f, #%d has been choosen\n", k, distsqr_sum, i); fflush(stdout);
+                //printf("kmeans++ iteration %3d: %f, #%d has been choosen\n", k, distsqr_sum, i); fflush(stdout);
                 break;
             }
         }
@@ -216,7 +216,7 @@ real_t KMeansppSolver::cluster_weighted(const std::vector<Vector> &dataset,
         const std::vector<real_t> &weight, std::vector<Vector> &centroids, int K) {
     if (dataset.size() == 0) {
-        printf("[WARNING] no data given");
+        //printf("[WARNING] no data given");
         return 0;
     }
     // TODO: not to waste that much memory, write start from scratch
diff --git a/src/kmeans.cc b/src/kmeans.cc
index dbd1df9..ed08b18 100644
--- a/src/kmeans.cc
+++ b/src/kmeans.cc
@@ -149,7 +149,7 @@ using namespace KMeansSolverImpl;
 
 real_t KMeansSolver::Lloyds_iteration(
         const Dataset &dataset, std::vector<Vector> &centroids) {
-    printf("entering Lloyds_iteration\n"); fflush(stdout);
+    //printf("entering Lloyds_iteration\n"); fflush(stdout);
     Threadpool p(concurrency);
 
     int n = dataset.size();
@@ -188,15 +188,15 @@ real_t KMeansSolver::Lloyds_iteration(
             distsqr_best = distsqr_sum;
             centroids_best = centroids;
         }
-        printf("iteration %3d: %f\n", iter_id, distsqr_sum); fflush(stdout);
+        //printf("iteration %3d: %f\n", iter_id, distsqr_sum); fflush(stdout);
 
         if (fabs(last_distsqr_sum - distsqr_sum) < 1e-6) { // || fabs(distsqr_sum - last_distsqr_sum) / last_distsqr_sum < 1e-6)
-            printf("distance squared sum not changing, converged.\n");
+            //printf("distance squared sum not changing, converged.\n");
             break;
         }
         const real_t terminate_cost_factor = 1.5;
         if (distsqr_sum > distsqr_best * terminate_cost_factor) {
-            printf("distance square sum %f is worse than best found by a factor of %f. ternimating.\n", distsqr_sum, terminate_cost_factor);
+            //printf("distance square sum %f is worse than best found by a factor of %f. ternimating.\n", distsqr_sum, terminate_cost_factor);
             break;
         }
@@ -233,12 +233,12 @@ real_t KMeansSolver::Lloyds_iteration(
         last_distsqr_sum = distsqr_sum;
 
         if (iter_id == n_iter_max - 1) {
-            printf("max number of iterations(%d) reached, quit iteration\n", n_iter_max); fflush(stdout);
+            //printf("max number of iterations(%d) reached, quit iteration\n", n_iter_max); fflush(stdout);
         }
     }
 
     centroids = centroids_best;
-    printf("minimum distance squared sum(Lloyds_iteration): %f\n", distsqr_best); fflush(stdout);
+    //printf("minimum distance squared sum(Lloyds_iteration): %f\n", distsqr_best); fflush(stdout);
 
     return distsqr_best;
 }
@@ -248,7 +248,7 @@ real_t KMeansSolver::Lloyds_iteration_weighted(
         const Dataset &dataset, const std::vector<real_t> &weight,
         std::vector<Vector> &centroids) {
-    printf("entering Lloyds_iteration_weighted\n"); fflush(stdout);
+    //printf("entering Lloyds_iteration_weighted\n"); fflush(stdout);
     Threadpool p(concurrency);
 
     int n = dataset.size();
@@ -293,7 +293,7 @@ real_t KMeansSolver::Lloyds_iteration_weighted(
             centroids_best = centroids;
         }
 
-        printf("iteration %3d: %f\n", iter_id, distsqr_sum); fflush(stdout);
+        //printf("iteration %3d: %f\n", iter_id, distsqr_sum); fflush(stdout);
 
         if (fabs(last_distsqr_sum - distsqr_sum) < 1e-6)// || fabs(distsqr_sum - last_distsqr_sum) / last_distsqr_sum < 1e-6)
             break;
@@ -329,12 +329,12 @@ real_t KMeansSolver::Lloyds_iteration_weighted(
         last_distsqr_sum = distsqr_sum;
 
         if (iter_id == n_iter_max - 1) {
-            printf("max number of iterations(%d) reached, quit iteration\n", n_iter_max); fflush(stdout);
+            //printf("max number of iterations(%d) reached, quit iteration\n", n_iter_max); fflush(stdout);
         }
     }
 
     centroids = centroids_best;
-    printf("minimum distance squared sum(Lloyds_iteration_weighted): %f\n", distsqr_best); fflush(stdout);
+    //printf("minimum distance squared sum(Lloyds_iteration_weighted): %f\n", distsqr_best); fflush(stdout);
 
     return distsqr_best;
 }
diff --git a/src/kmeansII.cc b/src/kmeansII.cc
index b4d96b0..1bdad44 100644
--- a/src/kmeansII.cc
+++ b/src/kmeansII.cc
@@ -87,7 +87,7 @@ real_t KMeansIISolver::cluster(const Dataset &dataset, std::vector<Vector> &centroids_param, int K) {
     for (auto &p: centroids_param)
         p.resize(m);
 
-    printf("kmeansII initializing ...\n");
+    //printf("kmeansII initializing ...\n");
 
     // initial cluster with one random point
     std::vector<Vector> centroids(1, centroids_param[0]);
@@ -142,8 +142,8 @@ real_t KMeansIISolver::cluster(const Dataset &dataset, std::vector<Vector> &centroids_param, int K) {
             }
         }
 
-        printf("kmeansII iteration %3d: %f, #new: %lu, #all: %lu\n",
-                iter_id, distsqr_sum, centroids.size() - last_size, centroids.size());
+        /*printf("kmeansII iteration %3d: %f, #new: %lu, #all: %lu\n",
+                iter_id, distsqr_sum, centroids.size() - last_size, centroids.size());*/
         fflush(stdout);
 
         if (centroids.size() - last_size == 0)
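A usage sketch of the API this diff leaves behind (a hypothetical example, not part of the patch: the constructor argument order follows the GMM::GMM hunk, the trainer settings, data, and file name are made up, and it assumes fit() accepts a std::vector<std::vector<real_t>>, that gmm.hh pulls in the real_t typedef, and that the COVTYPE_DIAGONAL assert still visible in init_gaussians() is compiled out or removed in a follow-up):

    // usage_sketch.cc -- hypothetical example exercising the new API.
    #include <fstream>
    #include <iostream>
    #include <vector>
    #include "gmm.hh"

    int main() {
        // toy 2-D training data, one inner vector per sample
        std::vector<std::vector<real_t>> X = {
            {0.0, 0.1}, {0.2, -0.1}, {-0.1, 0.0}, {0.1, 0.2},
        };

        // a single component, so GMM::mahalanobis_of is defined
        GMM gmm(1, COVTYPE_FULL, new GMMTrainerBaseline(/* nr_iter = */ 10));
        gmm.fit(X);

        std::cout << "mahalanobis(X[0]) = " << gmm.mahalanobis_of(X[0]) << "\n";

        std::ofstream out("model.gmm");  // file name is made up
        gmm.dump(out);
        return 0;
    }

Note the serialization change this exercises: dump() now writes "nr_mixtures dim covariance_type" on the first line and load() reads all three back, so model files produced before this patch are no longer loadable, and the covariance type no longer silently defaults to COVTYPE_DIAGONAL on load.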