From a2090bb0c60ef5126c52a7fbd277a0f1120db2a6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Mar 2020 16:20:45 +0100 Subject: [PATCH 1/9] ENH add poisson splitting rule to regression tree --- DESCRIPTION | 2 +- R/RcppExports.R | 4 +- R/predict.R | 3 +- R/ranger.R | 30 +++- man/ranger.Rd | 8 +- src/Forest.cpp | 41 +++--- src/Forest.h | 14 +- src/RcppExports.cpp | 9 +- src/Tree.cpp | 36 ++--- src/Tree.h | 5 +- src/TreeRegression.cpp | 212 +++++++++++++++++++++++++---- src/TreeRegression.h | 16 +++ src/globals.h | 5 +- src/rangerCpp.cpp | 7 +- src/utility.h | 15 ++ tests/testthat/test_poissonsplit.R | 66 +++++++++ tests/testthat/test_quantreg.R | 2 +- 17 files changed, 386 insertions(+), 89 deletions(-) create mode 100644 tests/testthat/test_poissonsplit.R diff --git a/DESCRIPTION b/DESCRIPTION index 46dabaa47..1deb91168 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,6 +17,6 @@ LinkingTo: Rcpp, RcppEigen Depends: R (>= 3.1) Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/R/RcppExports.R b/R/RcppExports.R index 2c775d8b4..fba850cd5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, 
use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { - .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) +rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { + .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, 
unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) } numSmaller <- function(values, reference) { diff --git a/R/predict.R b/R/predict.R index 9eebd74ec..3edafcd80 100644 --- a/R/predict.R +++ b/R/predict.R @@ -233,6 +233,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, splitrule <- 1 alpha <- 0 minprop <- 0 + poisson.tau <- 1 case.weights <- c(0, 0) use.case.weights <- FALSE class.weights <- c(0, 0) @@ -269,7 +270,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, prediction.mode, forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule, case.weights, use.case.weights, class.weights, - predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, + predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, regularization.factor, use.regularization.factor, regularization.usedepth) diff --git a/R/ranger.R b/R/ranger.R index 3d38f21b0..80e1621f3 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -98,10 +98,14 @@ ##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values. ##' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees. 
##' @param class.weights Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes. -##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank". +##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". +##' For regression "variance", "extratrees", "maxstat", "beta" or "poisson" with default "variance". +##' For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank". ##' @param num.random.splits For "extratrees" splitrule.: Number of random splits to consider for each candidate splitting variable. ##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting. ##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting. +##' @param poisson.tau For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}. +##' If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parent) / (\tau + samples(child) mean(parent))}. ##' @param split.select.weights Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used. ##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting. 
##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition" for all other splitrules 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details. @@ -213,6 +217,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, sample.fraction = ifelse(replace, 1, 0.632), case.weights = NULL, class.weights = NULL, splitrule = NULL, num.random.splits = 1, alpha = 0.5, minprop = 0.1, + poisson.tau = 1, split.select.weights = NULL, always.split.variables = NULL, respect.unordered.factors = NULL, scale.permutation.importance = FALSE, @@ -729,6 +734,17 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if ((is.factor(y) && nlevels(y) > 2) || (length(unique(y)) > 2)) { stop("Error: Hellinger splitrule only implemented for binary classification.") } + } else if (splitrule == "poisson") { + if (treetype == 3) { + splitrule.num <- 8 + } else { + stop("Error: poisson splitrule applicable to regression data only.") + } + + ## Check for valid responses + if (min(y) < 0 || sum(y) <= 0) { + stop("Error: poisson splitrule applicable to regression data with non-positive outcome (y>=0 and sum(y)>0) only.") + } } else { stop("Error: Unknown splitrule.") } @@ -754,6 +770,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (num.random.splits > 1 && splitrule.num != 5) { warning("Argument 'num.random.splits' ignored if splitrule is not 'extratrees'.") } + + if (!is.numeric(poisson.tau) || poisson.tau <= 0) { + stop("Error: Invalid value for poisson.tau, please give a positive number.") + } ## Unordered factors if (respect.unordered.factors == "partition") { @@ -790,6 +810,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: Unordered factor splitting not implemented for 'C' splitting rule.") } else if (splitrule 
== "beta") { stop("Error: Unordered factor splitting not implemented for 'beta' splitting rule.") + } else if (splitrule == "poisson") { + stop("Error: Unordered factor splitting not implemented for 'poisson' splitting rule.") } } @@ -851,9 +873,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, prediction.mode, loaded.forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule.num, case.weights, use.case.weights, class.weights, - predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, - num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, - inbag, use.inbag, + predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau, + holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, + order.snps, oob.error, max.depth, inbag, use.inbag, regularization.factor, use.regularization.factor, regularization.usedepth) if (length(result) == 0) { diff --git a/man/ranger.Rd b/man/ranger.Rd index 3342e8087..7e73682cb 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -22,6 +22,7 @@ ranger( num.random.splits = 1, alpha = 0.5, minprop = 0.1, + poisson.tau = 1, split.select.weights = NULL, always.split.variables = NULL, respect.unordered.factors = NULL, @@ -72,7 +73,9 @@ ranger( \item{class.weights}{Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.} -\item{splitrule}{Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".} +\item{splitrule}{Splitting rule. 
For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". +For regression "variance", "extratrees", "maxstat", "beta" or "poisson" with default "variance". +For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".} \item{num.random.splits}{For "extratrees" splitrule.: Number of random splits to consider for each candidate splitting variable.} @@ -80,6 +83,9 @@ ranger( \item{minprop}{For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.} +\item{poisson.tau}{For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}. +If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parent) / (\tau + samples(child) mean(parent))}.} + \item{split.select.weights}{Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. 
Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.} \item{always.split.variables}{Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting.} diff --git a/src/Forest.cpp b/src/Forest.cpp index afaaa46ef..4495495c6 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -29,12 +29,14 @@ namespace ranger { Forest::Forest() : - verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), seed(0), num_samples( - 0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting( - false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout( - false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( - DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), num_threads(DEFAULT_NUM_THREADS), data { }, overall_prediction_error( - NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), progress(0) { + verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), + seed(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), + memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), + sample_fraction( { 1 }), holdout(false), prediction_type(DEFAULT_PREDICTIONTYPE), + num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), + minprop(DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_threads(DEFAULT_NUM_THREADS), data { }, + overall_prediction_error(NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), + progress(0) { } // #nocov start @@ -44,9 +46,9 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode std::string 
split_select_weights_file, const std::vector& always_split_variable_names, std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout, - PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, - bool regularization_usedepth) { + std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, + double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, + const std::vector& regularization_factor, bool regularization_usedepth) { this->verbose_out = verbose_out; @@ -81,8 +83,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode // Call other init function init(memory_mode, loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, - splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, - false, max_depth, regularization_factor, regularization_usedepth); + splitrule, predict_all, sample_fraction_vector, alpha, minprop, poisson_tau, holdout, prediction_type, + num_random_splits, false, max_depth, regularization_factor, regularization_usedepth); if (prediction_mode) { loadFromFile(load_forest_filename); @@ -139,16 +141,17 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, bool keep_inbag, - std::vector& sample_fraction, double alpha, double 
minprop, bool holdout, PredictionType prediction_type, - uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { + std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, + PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, + const std::vector& regularization_factor, bool regularization_usedepth) { this->verbose_out = verbose_out; // Call other init function init(MEM_DOUBLE, std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, - predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, - regularization_factor, regularization_usedepth); + predict_all, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits, + order_snps, max_depth, regularization_factor, regularization_usedepth); // Set variables to be always considered for splitting if (!always_split_variable_names.empty()) { @@ -181,8 +184,9 @@ void Forest::init(MemoryMode memory_mode, std::unique_ptr input_data, uint uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, - double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, - uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { + double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type, + uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, + bool 
regularization_usedepth) { // Initialize data with memmode this->data = std::move(input_data); @@ -223,6 +227,7 @@ void Forest::init(MemoryMode memory_mode, std::unique_ptr input_data, uint this->holdout = holdout; this->alpha = alpha; this->minprop = minprop; + this->poisson_tau = poisson_tau; this->prediction_type = prediction_type; this->num_random_splits = num_random_splits; this->max_depth = max_depth; @@ -477,7 +482,7 @@ void Forest::grow() { trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting, - splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, + splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, poisson_tau, holdout, num_random_splits, max_depth, ®ularization_factor, regularization_usedepth, &split_varIDs_used); } diff --git a/src/Forest.h b/src/Forest.h index 485f74452..30a346884 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -47,7 +47,7 @@ class Forest { std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, - bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, + double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, @@ -55,15 +55,16 @@ class Forest { const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool 
memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, - bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, - PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, + bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, + bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void init(MemoryMode memory_mode, std::unique_ptr input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, - double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, - bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); + double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type, + uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, + bool regularization_usedepth); virtual void initInternal() = 0; // Grow or predict @@ -208,6 +209,9 @@ class Forest { // MAXSTAT splitrule double alpha; double minprop; + + // POISSON splitrule + double poisson_tau; // Multithreading uint num_threads; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 499d16890..6916ac779 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -8,8 +8,8 @@ using namespace Rcpp; // rangerCpp -Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint 
seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); -RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP 
use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) { +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); +RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP 
predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP poisson_tauSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -46,6 +46,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::vector& >::type sample_fraction(sample_fractionSEXP); Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< double >::type minprop(minpropSEXP); + Rcpp::traits::input_parameter< double >::type poisson_tau(poisson_tauSEXP); Rcpp::traits::input_parameter< bool >::type holdout(holdoutSEXP); Rcpp::traits::input_parameter< uint >::type prediction_type_r(prediction_type_rSEXP); Rcpp::traits::input_parameter< uint >::type num_random_splits(num_random_splitsSEXP); @@ -59,7 +60,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::vector& >::type regularization_factor(regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type use_regularization_factor(use_regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type regularization_usedepth(regularization_usedepthSEXP); - rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, 
num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)); + rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)); return rcpp_result_gen; END_RCPP } @@ -90,7 +91,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 46}, + {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 47}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3}, {NULL, NULL, 0} diff --git a/src/Tree.cpp b/src/Tree.cpp index db423a4cf..545167433 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -17,31 +17,32 @@ namespace ranger { Tree::Tree() : - mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), case_weights( - 0), manual_inbag(0), oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth( - false), split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( - true), sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( - 
DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( - 0) { + mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), + case_weights(0), manual_inbag(0), oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), + regularization_factor(0), regularization_usedepth(false), split_varIDs_used(0), variable_importance(0), + importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction(0), + memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), + poisson_tau(DEFAULT_POISSON_TAU), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), + depth(0), last_left_nodeID(0) { } Tree::Tree(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values) : - mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), case_weights( - 0), manual_inbag(0), split_varIDs(split_varIDs), split_values(split_values), child_nodeIDs(child_nodeIDs), oob_sampleIDs( - 0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used( - 0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction( - 0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( - DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( - 0) { + mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), + case_weights(0), manual_inbag(0), split_varIDs(split_varIDs), split_values(split_values), child_nodeIDs(child_nodeIDs), + oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), + 
split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), + sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), + minprop(DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), + max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID(0) { } void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, - double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, - bool regularization_usedepth, std::vector* split_varIDs_used) { + double minprop, double poisson_tau, bool holdout, uint num_random_splits, uint max_depth, + std::vector* regularization_factor, bool regularization_usedepth, std::vector* split_varIDs_used) { this->data = data; this->mtry = mtry; @@ -69,6 +70,7 @@ void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std: this->holdout = holdout; this->alpha = alpha; this->minprop = minprop; + this->poisson_tau = poisson_tau; this->num_random_splits = num_random_splits; this->max_depth = max_depth; this->regularization_factor = regularization_factor; @@ -89,7 +91,7 @@ void Tree::grow(std::vector* variable_importance) { this->variable_importance = variable_importance; -// Bootstrap, dependent if weighted or not and with or without replacement + // Bootstrap, dependent if weighted or not and with or without replacement if (!case_weights->empty()) { if (sample_with_replacement) { bootstrapWeighted(); @@ -154,7 +156,7 @@ void Tree::predict(const Data* prediction_data, bool oob_prediction) { 
prediction_terminal_nodeIDs.resize(num_samples_predict, 0); -// For each sample start in root, drop down the tree and return final value + // For each sample start in root, drop down the tree and return final value for (size_t i = 0; i < num_samples_predict; ++i) { size_t sample_idx; if (oob_prediction) { diff --git a/src/Tree.h b/src/Tree.h index cb409b39c..d70362862 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -39,8 +39,8 @@ class Tree { std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, - std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, - uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, + std::vector* sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, + uint num_random_splits, uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, std::vector* split_varIDs_used); virtual void allocateMemory() = 0; @@ -223,6 +223,7 @@ class Tree { SplitRule splitrule; double alpha; double minprop; + double poisson_tau; uint num_random_splits; uint max_depth; uint depth; diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 6855a867e..95a986a39 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -42,15 +42,47 @@ void TreeRegression::allocateMemory() { } double TreeRegression::estimate(size_t nodeID) { - -// Mean of responses of samples in node - double sum_responses_in_node = 0; + + // Mean of responses of samples in node + double sum_responses_in_node = sumNodeResponse(nodeID); size_t num_samples_in_node = end_pos[nodeID] - start_pos[nodeID]; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_responses_in_node += data->get_y(sampleID, 0); + if (splitrule == POISSON && 
sum_responses_in_node == 0.) { + // Poisson is not allowed to predict 0. + // We use a weighted average of parent and child mean values, + // see vignette "Introduction to Rpart" Chapter 8.2 and + // https://ssrn.com/abstract=2870308 Chapter 6.1.3 + + // Search for parent's nodeID: loop over all nodeIDs + size_t parent_nodeID = 0; + bool found = false; + // Loop over left child nodes + for(std::size_t i = 0; i < child_nodeIDs[0].size(); ++i) { + // Record parent if node found among left children (no break; a node has at most one parent) + if (child_nodeIDs[0][i] == nodeID) { + parent_nodeID = i; + found = true; + } + } + if (!found) { + // Loop over right child nodes + for(std::size_t i = 0; i < child_nodeIDs[1].size(); ++i) { + // Record parent if node found among right children (no break; a node has at most one parent) + if (child_nodeIDs[1][i] == nodeID) { + parent_nodeID = i; + found = true; + } + } + } + + double sum_responses_in_parent = sumNodeResponse(parent_nodeID); + size_t num_samples_in_parent = end_pos[parent_nodeID] - start_pos[parent_nodeID]; + double mean_node = (sum_responses_in_node / (double) num_samples_in_node); + double mean_parent = (sum_responses_in_parent / (double) num_samples_in_parent); + double alpha = num_samples_in_node * mean_parent/(num_samples_in_node * mean_parent + poisson_tau); + return alpha * mean_node + (1 - alpha) * mean_parent; + } else { + return (sum_responses_in_node / (double) num_samples_in_node); } - return (sum_responses_in_node / (double) num_samples_in_node); } void TreeRegression::appendToFileInternal(std::ofstream& file) { // #nocov start // Empty on purpose } // #nocov end bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi pure_value = value; } if (pure) { - split_values[nodeID] = pure_value; + if (splitrule == POISSON && pure_value == 0.) 
{ + split_values[nodeID] = estimate(nodeID); + } else { + split_values[nodeID] = pure_value; + } return true; } @@ -92,6 +128,8 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi stop = findBestSplitExtraTrees(nodeID, possible_split_varIDs); } else if (splitrule == BETA) { stop = findBestSplitBeta(nodeID, possible_split_varIDs); + } else if (splitrule == POISSON) { + stop = findBestSplitPoisson(nodeID, possible_split_varIDs); } else { stop = findBestSplit(nodeID, possible_split_varIDs); } @@ -135,11 +173,7 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ double best_value = 0; // Compute sum of responses in node - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } + double sum_node = sumNodeResponse(nodeID); // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -504,11 +538,7 @@ bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector& double best_value = 0; // Compute sum of responses in node - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } + double sum_node = sumNodeResponse(nodeID); // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -730,11 +760,7 @@ bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector& possi double best_value = 0; // Compute sum of responses in node - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } + double sum_node = sumNodeResponse(nodeID); // For all possible split variables find best split value for (auto& varID : possible_split_varIDs) { @@ -892,18 +918,146 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double 
} } +bool TreeRegression::findBestSplitPoisson(size_t nodeID, std::vector& possible_split_varIDs) { + + size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; + double best_decrease = -std::numeric_limits::infinity(); + size_t best_varID = 0; + double best_value = 0; + + // Compute sum of responses in node + double sum_node = sumNodeResponse(nodeID); + + // For all possible split variables find best split value + for (auto& varID : possible_split_varIDs) { + findBestSplitValuePoisson(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + } + + // Stop if no good split found + if (std::isinf(-best_decrease)) { + return true; + } + + // Save best values + split_varIDs[nodeID] = best_varID; + split_values[nodeID] = best_value; + + // Compute decrease of impurity for this node and add to variable importance if needed + if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { + addImpurityImportance(nodeID, best_varID, best_decrease); + } + + // Regularization + saveSplitVarID(best_varID); + + return false; +} + +void TreeRegression::findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease) { + + // Create possible split values + std::vector possible_split_values; + data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); + + // Try next variable if all equal for this + if (possible_split_values.size() < 2) { + return; + } + + // -1 because no split possible at largest value + const size_t num_splits = possible_split_values.size() - 1; + if (memory_saving_splitting) { + std::vector sums_right(num_splits); + std::vector n_right(num_splits); + findBestSplitValuePoisson(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, + possible_split_values, sums_right, n_right); + } else { + std::fill_n(sums.begin(), num_splits, 0); + 
std::fill_n(counter.begin(), num_splits, 0); + findBestSplitValuePoisson(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, + possible_split_values, sums, counter); + } +} + +void TreeRegression::findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease, + std::vector possible_split_values, + std::vector& sums_right, std::vector& n_right) { + // -1 because no split possible at largest value + const size_t num_splits = possible_split_values.size() - 1; + + // Sum in right child and possible split + for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { + size_t sampleID = sampleIDs[pos]; + double value = data->get_x(sampleID, varID); + double response = data->get_y(sampleID, 0); + + // Count samples until split_value reached + for (size_t i = 0; i < num_splits; ++i) { + if (value > possible_split_values[i]) { + ++n_right[i]; + sums_right[i] += response; + } else { + break; + } + } + } + + // Compute Poisson deviance for each possible split + for (size_t i = 0; i < num_splits; ++i) { + + // Stop if right child empty + size_t n_left = num_samples_node - n_right[i]; + if (n_right[i] == 0) { + continue; + } + + // Compute mean + double sum_right = sums_right[i]; + double mean_right = sum_right / (double) n_right[i]; + double sum_left = sum_node - sum_right; + double mean_left = sum_left / (double) n_left; + + // Poisson deviance = 2 * (y_true * log(y_true/y_pred) + y_pred - y_true) + // decrease = - 1/2 * (sum_left(poisson_deviance) + sum_right(poisson_deviance)) + // = + sum_left(y) * log(mean_left) + sum_right(y) * log(mean_right) + const + 0 + // The smaller the deviance, the better => the larger the decrease, the better. 
+ double decrease = xlogy(sum_left, mean_left) + xlogy(sum_right, mean_right); + + // Stop if no result + if (std::isnan(decrease)) { + continue; + } + + // Regularization + if (decrease > 0) { + regularize(decrease, varID); + } else { + regularizeNegative(decrease, varID); + } + + // If better than before, use this + if (decrease > best_decrease) { + best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2; + best_varID = varID; + best_decrease = decrease; + + // Use smaller value if average is numerically the same as the larger value + if (best_value == possible_split_values[i + 1]) { + best_value = possible_split_values[i]; + } + } + } +} + void TreeRegression::addImpurityImportance(size_t nodeID, size_t varID, double decrease) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; double best_decrease = decrease; if (splitrule != MAXSTAT) { - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } - + double sum_node = sumNodeResponse(nodeID); double impurity_node = (sum_node * sum_node / (double) num_samples_node); // Account for the regularization diff --git a/src/TreeRegression.h b/src/TreeRegression.h index 84c224f63..77dec69f7 100644 --- a/src/TreeRegression.h +++ b/src/TreeRegression.h @@ -82,10 +82,26 @@ class TreeRegression: public Tree { void findBestSplitValueBeta(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, std::vector& sums_right, std::vector& n_right); + + bool findBestSplitPoisson(size_t nodeID, std::vector& possible_split_varIDs); + void findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, + size_t& best_varID, double& best_decrease); + void findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t 
num_samples_node, double& best_value, + size_t& best_varID, double& best_decrease, std::vector possible_split_values, + std::vector& sums_right, std::vector& n_right); void addImpurityImportance(size_t nodeID, size_t varID, double decrease); double computePredictionMSE(); + + // Compute sum of responses in node. As in-class definition, this is inline by default. + double sumNodeResponse(size_t nodeID) { + double sum_node = 0; + for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { + sum_node += data->get_y(sampleIDs[pos], 0); + } + return sum_node; + } void cleanUpInternal() override { counter.clear(); diff --git a/src/globals.h b/src/globals.h index b794273be..26c6f6aae 100644 --- a/src/globals.h +++ b/src/globals.h @@ -69,7 +69,8 @@ enum SplitRule { MAXSTAT = 4, EXTRATREES = 5, BETA = 6, - HELLINGER = 7 + HELLINGER = 7, + POISSON = 8 }; // Prediction type @@ -92,6 +93,8 @@ const SplitRule DEFAULT_SPLITRULE = LOGRANK; const double DEFAULT_ALPHA = 0.5; const double DEFAULT_MINPROP = 0.1; +const double DEFAULT_POISSON_TAU = 1; + const uint DEFAULT_MAXDEPTH = 0; const PredictionType DEFAULT_PREDICTIONTYPE = RESPONSE; const uint DEFAULT_NUM_RANDOM_SPLITS = 1; diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index 991fc9609..f3c1169ef 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -57,8 +57,8 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, - std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, - uint num_random_splits, Eigen::SparseMatrix& sparse_x, + std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, + bool holdout, uint prediction_type_r, uint num_random_splits, 
Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth) { @@ -151,7 +151,8 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM forest->initR(std::move(data), mtry, num_trees, verbose_out, seed, num_threads, importance_mode, min_node_size, split_select_weights, always_split_variable_names, prediction_mode, sample_with_replacement, unordered_variable_names, save_memory, splitrule, case_weights, - inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, + inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, + holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth); // Load forest object if in prediction mode diff --git a/src/utility.h b/src/utility.h index ac809b1a6..4761bc3f3 100644 --- a/src/utility.h +++ b/src/utility.h @@ -12,6 +12,7 @@ #ifndef UTILITY_H_ #define UTILITY_H_ +#include #include #include #include @@ -525,6 +526,20 @@ std::stringstream& readFromStream(std::stringstream& in, double& token); */ double betaLogLik(double y, double mean, double phi); +/** + * Compute x * log(y) with 0 * log(..) equal to 0. + * @param x + * @param y + * @return x * log(y) + */ +inline double xlogy(double x, double y) { + if (x == 0) { + return 0; + } else { + return x * log(y); + } +} + // User interrupt from R #ifdef R_BUILD static void chkIntFn(void *dummy) { diff --git a/tests/testthat/test_poissonsplit.R b/tests/testthat/test_poissonsplit.R new file mode 100644 index 000000000..04d8ffa90 --- /dev/null +++ b/tests/testthat/test_poissonsplit.R @@ -0,0 +1,66 @@ +library(ranger) + +# Generate poisson distributed outcome +set.seed(42) +n <- 1000 +p <- 4 +beta <- c(0, 0.1, -0.2, 0.3) +x <- replicate(p, runif(n)) +# Use exp(..) 
to keep it positive (and adds interactions). +# Add -1 to make it small as Poisson should be better for small frequencies. +lambda <- exp(-1 + as.vector(x %*% beta)) +y <- rpois(n, lambda) +df <- data.frame(y = y, x) + +# And a simple dataset with zero outcomes +df2 <- data.frame(y = c(0, 0, 0, 0, 0, 1, 2, 3, 4, 5), + x1 = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"), + x2 = c(0, 0, 0, 0, 0, 1, 1, 1, 2, 2)) + +poisson_deviance <- function(y_true, y_pred) { + if (any(y_true == 0 & y_pred == 0)) { + stop("Error: Poisson deviance does not exist for y_pred == y_true == 0.") + } + pos <- y_true > 0 + dev <- y_pred + dev[pos] <- y_true[pos] * log(y_true[pos] / y_pred[pos]) - y_true[pos] + y_pred[pos] + return(2 * mean(dev)) +} + + +test_that("poisson splitting works on poisson distributed data", { + n_train = 1:(4*n %/% 5) + n_test = (max(n_train)+1):n + df_train = df[n_train, ] + df_test = df[n_test, ] + rf_poi <- ranger(y ~ ., df_train, splitrule = "poisson", num.trees = 50, min.node.size = 50, poisson.tau = 1, seed = 12) + rf_mse <- ranger(y ~ ., df_train, splitrule = "variance", num.trees = 50, min.node.size = 50, seed = 13) + + expect_is(rf_poi, "ranger") + # deviance on test set + expect_lt(poisson_deviance(df_test$y, predict(rf_poi, df_test)$predictions), + poisson_deviance(df_test$y, predict(rf_mse, df_test)$predictions)) +}) + +test_that("poisson splitting not working for negative outcome", { + expect_error(ranger(y ~ ., data.frame(y = c(-1.5, 2), x = c(1, 2)), splitrule = "poisson")) + expect_error(ranger(y ~ ., data.frame(y = c(0, 0), x = c(1, 2)), splitrule = "poisson")) +}) + +test_that("poisson.tau <= 0 throws error", { + expect_error(ranger(y ~ ., df2, poisson.tau = 0)) +}) + +test_that("poisson splitting predicts positive even on nodes with all values equal 0", { + rf <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 0.1, mtry = 2, num.trees = 2, + min.node.size = 1, seed = 123) + expect_true(all(c(predict(rf, df2, predict.all = 
TRUE)$predictions) > 0)) +}) + +test_that("poisson splitting gives larger predictions for larger values of poisson.tau on pure nodes with y = 0", { + rf1 <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 0.1, mtry = 2, num.trees = 2, + min.node.size = 1, seed = 123) + rf2 <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 10, mtry = 2, num.trees = 2, + min.node.size = 1, seed = 123) + expect_true(all(predict(rf2, df2)$predictions[df2$y == 0] > predict(rf1, df2)$predictions[df2$y == 0])) +}) diff --git a/tests/testthat/test_quantreg.R b/tests/testthat/test_quantreg.R index b7f4481e6..8eed5ce58 100644 --- a/tests/testthat/test_quantreg.R +++ b/tests/testthat/test_quantreg.R @@ -2,7 +2,7 @@ library(ranger) context("ranger_quantreg") rf.quant <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE, - keep.inbag = TRUE, num.trees = 50) + keep.inbag = TRUE, num.trees = 100, seed = 0) pred.quant <- predict(rf.quant, mtcars[27:32, ], type = "quantiles") test_that("Quantile prediction is of correct size", { From 1a975f09aab015a9c2dcaff481c807701e8ff822 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Mar 2020 16:29:25 +0100 Subject: [PATCH 2/9] ENH check in forest with poisson splitrule for valid range of y --- src/ForestRegression.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/ForestRegression.cpp b/src/ForestRegression.cpp index f14035773..7522f6f33 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -61,6 +61,21 @@ void ForestRegression::initInternal() { } } } + + // Error if poisson splitrule used with negative data + if (splitrule == POISSON && !prediction_mode) { + double y_sum = 0; + for (size_t i = 0; i < num_samples; ++i) { + double y = data->get_y(i, 0); + y_sum += y; + if (y < 0) { + throw std::runtime_error("Poisson splitrule applicable to regression data with non-positive outcome (y>=0 and sum(y)>0) only."); + } + } + if (y_sum <= 0) { + throw std::runtime_error("Poisson splitrule 
applicable to regression data with non-positive outcome (y>=0 and sum(y)>0) only."); + } + } // Sort data if memory saving mode if (!memory_saving_splitting) { From cb430899ad4a31379476cdfe322b008926453690 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Mar 2020 16:39:02 +0100 Subject: [PATCH 3/9] MNT indend comments correctly --- src/Forest.cpp | 34 +++++++++++++------------- src/ForestRegression.cpp | 18 +++++++------- src/Tree.cpp | 52 ++++++++++++++++++++-------------------- src/TreeRegression.cpp | 22 ++++++++--------- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/src/Forest.cpp b/src/Forest.cpp index 4495495c6..b05545540 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -486,10 +486,10 @@ void Forest::grow() { num_random_splits, max_depth, ®ularization_factor, regularization_usedepth, &split_varIDs_used); } -// Init variable importance + // Init variable importance variable_importance.resize(num_independent_variables, 0); -// Grow trees in multiple threads + // Grow trees in multiple threads #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; @@ -511,7 +511,7 @@ void Forest::grow() { std::vector threads; threads.reserve(num_threads); -// Initialize importance per thread + // Initialize importance per thread std::vector> variable_importance_threads(num_threads); for (uint i = 0; i < num_threads; ++i) { @@ -544,7 +544,7 @@ void Forest::grow() { #endif -// Divide importance by number of trees + // Divide importance by number of trees if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { for (auto& v : variable_importance) { v /= num_trees; @@ -554,7 +554,7 @@ void Forest::grow() { void Forest::predict() { -// Predict trees in multiple threads and join the threads with the main thread + // Predict trees in multiple threads and join the threads with the main thread #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; @@ -650,14 +650,14 @@ void Forest::computePredictionError() { void 
Forest::computePermutationImportance() { -// Compute tree permutation importance in multiple threads + // Compute tree permutation importance in multiple threads #ifdef OLD_WIN_R_BUILD // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); -// Initialize importance and variance + // Initialize importance and variance variable_importance.resize(num_independent_variables, 0); std::vector variance; if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { @@ -667,7 +667,7 @@ void Forest::computePermutationImportance() { variable_importance_casewise.resize(num_independent_variables * num_samples, 0); } -// Compute importance + // Compute importance for (size_t i = 0; i < num_trees; ++i) { trees[i]->computePermutationImportance(variable_importance, variance, variable_importance_casewise); progress++; @@ -684,12 +684,12 @@ void Forest::computePermutationImportance() { std::vector threads; threads.reserve(num_threads); -// Initialize importance and variance + // Initialize importance and variance std::vector> variable_importance_threads(num_threads); std::vector> variance_threads(num_threads); std::vector> variable_importance_casewise_threads(num_threads); -// Compute importance + // Compute importance for (uint i = 0; i < num_threads; ++i) { variable_importance_threads[i].resize(num_independent_variables, 0); if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { @@ -716,7 +716,7 @@ void Forest::computePermutationImportance() { } #endif -// Sum thread importances + // Sum thread importances variable_importance.resize(num_independent_variables, 0); for (size_t i = 0; i < num_independent_variables; ++i) { for (uint j = 0; j < num_threads; ++j) { @@ -725,7 +725,7 @@ void Forest::computePermutationImportance() { } variable_importance_threads.clear(); -// Sum thread variances + // Sum thread variances std::vector variance(num_independent_variables, 0); if (importance_mode == IMP_PERM_BREIMAN || 
importance_mode == IMP_PERM_LIAW) { for (size_t i = 0; i < num_independent_variables; ++i) { @@ -736,7 +736,7 @@ void Forest::computePermutationImportance() { variance_threads.clear(); } -// Sum thread casewise importances + // Sum thread casewise importances if (importance_mode == IMP_PERM_CASEWISE) { variable_importance_casewise.resize(num_independent_variables * num_samples, 0); for (size_t i = 0; i < variable_importance_casewise.size(); ++i) { @@ -954,12 +954,12 @@ std::unique_ptr Forest::loadDataFromFile(const std::string& data_path) { void Forest::setSplitWeightVector(std::vector>& split_select_weights) { -// Size should be 1 x num_independent_variables or num_trees x num_independent_variables + // Size should be 1 x num_independent_variables or num_trees x num_independent_variables if (split_select_weights.size() != 1 && split_select_weights.size() != num_trees) { throw std::runtime_error("Size of split select weights not equal to 1 or number of trees."); } -// Reserve space + // Reserve space size_t num_weights = num_independent_variables; if (importance_mode == IMP_GINI_CORRECTED) { num_weights = 2 * num_independent_variables; @@ -1032,7 +1032,7 @@ void Forest::setAlwaysSplitVariables(const std::vector& always_spli // #nocov start void Forest::showProgress(std::string operation, clock_t start_time, clock_t& lap_time) { -// Check for user interrupt + // Check for user interrupt if (checkInterrupt()) { throw std::runtime_error("User interrupt."); } @@ -1060,7 +1060,7 @@ void Forest::showProgress(std::string operation, size_t max_progress) { steady_clock::time_point last_time = steady_clock::now(); std::unique_lock lock(mutex); -// Wait for message from threads and show output if enough time elapsed + // Wait for message from threads and show output if enough time elapsed while (progress < max_progress) { condition_variable.wait(lock); seconds elapsed_time = duration_cast(steady_clock::now() - last_time); diff --git a/src/ForestRegression.cpp 
b/src/ForestRegression.cpp index 7522f6f33..40ebca2c3 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -123,7 +123,7 @@ void ForestRegression::predictInternal(size_t sample_idx) { void ForestRegression::computePredictionErrorInternal() { -// For each sample sum over trees where sample is OOB + // For each sample sum over trees where sample is OOB std::vector samples_oob_count; predictions = std::vector>>(1, std::vector>(1, std::vector(num_samples, 0))); @@ -138,7 +138,7 @@ void ForestRegression::computePredictionErrorInternal() { } } -// MSE with predictions and true data + // MSE with predictions and true data size_t num_predictions = 0; overall_prediction_error = 0; for (size_t i = 0; i < predictions[0][0].size(); ++i) { @@ -165,7 +165,7 @@ void ForestRegression::writeOutputInternal() { void ForestRegression::writeConfusionFile() { -// Open confusion file for writing + // Open confusion file for writing std::string filename = output_prefix + ".confusion"; std::ofstream outfile; outfile.open(filename, std::ios::out); @@ -173,7 +173,7 @@ void ForestRegression::writeConfusionFile() { throw std::runtime_error("Could not write to confusion file: " + filename + "."); } -// Write confusion to file + // Write confusion to file outfile << "Overall OOB prediction error (MSE): " << overall_prediction_error << std::endl; outfile.close(); @@ -183,7 +183,7 @@ void ForestRegression::writeConfusionFile() { void ForestRegression::writePredictionFile() { -// Open prediction file for writing + // Open prediction file for writing std::string filename = output_prefix + ".prediction"; std::ofstream outfile; outfile.open(filename, std::ios::out); @@ -219,21 +219,21 @@ void ForestRegression::writePredictionFile() { void ForestRegression::saveToFileInternal(std::ofstream& outfile) { -// Write num_variables + // Write num_variables outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); -// Write treetype + // Write treetype TreeType 
treetype = TREE_REGRESSION; outfile.write((char*) &treetype, sizeof(treetype)); } void ForestRegression::loadFromFileInternal(std::ifstream& infile) { -// Read number of variables + // Read number of variables size_t num_variables_saved; infile.read((char*) &num_variables_saved, sizeof(num_variables_saved)); -// Read treetype + // Read treetype TreeType treetype; infile.read((char*) &treetype, sizeof(treetype)); if (treetype != TREE_REGRESSION) { diff --git a/src/Tree.cpp b/src/Tree.cpp index 545167433..5f441a327 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -208,7 +208,7 @@ void Tree::computePermutationImportance(std::vector& forest_importance, size_t num_independent_variables = data->getNumCols(); -// Compute normal prediction accuracy for each tree. Predictions already computed.. + // Compute normal prediction accuracy for each tree. Predictions already computed.. double accuracy_normal; std::vector prederr_normal_casewise; std::vector prederr_shuf_casewise; @@ -223,10 +223,10 @@ void Tree::computePermutationImportance(std::vector& forest_importance, prediction_terminal_nodeIDs.clear(); prediction_terminal_nodeIDs.resize(num_samples_oob, 0); -// Reserve space for permutations, initialize with oob_sampleIDs + // Reserve space for permutations, initialize with oob_sampleIDs std::vector permutations(oob_sampleIDs); -// Randomly permute for all independent variables + // Randomly permute for all independent variables for (size_t i = 0; i < num_independent_variables; ++i) { // Permute and compute prediction accuracy again for this permutation and save difference @@ -257,12 +257,12 @@ void Tree::computePermutationImportance(std::vector& forest_importance, // #nocov start void Tree::appendToFile(std::ofstream& file) { -// Save general fields + // Save general fields saveVector2D(child_nodeIDs, file); saveVector1D(split_varIDs, file); saveVector1D(split_values, file); -// Call special functions for subclasses to save special fields. 
+ // Call special functions for subclasses to save special fields. appendToFileInternal(file); } // #nocov end @@ -378,7 +378,7 @@ void Tree::createEmptyNode() { size_t Tree::dropDownSamplePermuted(size_t permuted_varID, size_t sampleID, size_t permuted_sampleID) { -// Start in root and drop down + // Start in root and drop down size_t nodeID = 0; while (child_nodeIDs[0][nodeID] != 0 || child_nodeIDs[1][nodeID] != 0) { @@ -419,11 +419,11 @@ size_t Tree::dropDownSamplePermuted(size_t permuted_varID, size_t sampleID, size void Tree::permuteAndPredictOobSamples(size_t permuted_varID, std::vector& permutations) { -// Permute OOB sample -//std::vector permutations(oob_sampleIDs); + // Permute OOB sample + //std::vector permutations(oob_sampleIDs); std::shuffle(permutations.begin(), permutations.end(), random_number_generator); -// For each sample, drop down the tree and add prediction + // For each sample, drop down the tree and add prediction for (size_t i = 0; i < num_samples_oob; ++i) { size_t nodeID = dropDownSamplePermuted(permuted_varID, oob_sampleIDs[i], permutations[i]); prediction_terminal_nodeIDs[i] = nodeID; @@ -432,26 +432,26 @@ void Tree::permuteAndPredictOobSamples(size_t permuted_varID, std::vector unif_dist(0, num_samples - 1); -// Start with all samples OOB + // Start with all samples OOB inbag_counts.resize(num_samples, 0); -// Draw num_samples samples with replacement (num_samples_inbag out of n) as inbag and mark as not OOB + // Draw num_samples samples with replacement (num_samples_inbag out of n) as inbag and mark as not OOB for (size_t s = 0; s < num_samples_inbag; ++s) { size_t draw = unif_dist(random_number_generator); sampleIDs.push_back(draw); ++inbag_counts[draw]; } -// Save OOB samples + // Save OOB samples for (size_t s = 0; s < inbag_counts.size(); ++s) { if (inbag_counts[s] == 0) { oob_sampleIDs.push_back(s); @@ -467,25 +467,25 @@ void Tree::bootstrap() { void Tree::bootstrapWeighted() { -// Use fraction (default 63.21%) of the samples + 
// Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; -// Reserve space, reserve a little more to be save) + // Reserve space, reserve a little more to be save) sampleIDs.reserve(num_samples_inbag); oob_sampleIDs.reserve(num_samples * (exp(-(*sample_fraction)[0]) + 0.1)); - + std::discrete_distribution<> weighted_dist(case_weights->begin(), case_weights->end()); - -// Start with all samples OOB + + // Start with all samples OOB inbag_counts.resize(num_samples, 0); - -// Draw num_samples samples with replacement (n out of n) as inbag and mark as not OOB + + // Draw num_samples samples with replacement (n out of n) as inbag and mark as not OOB for (size_t s = 0; s < num_samples_inbag; ++s) { size_t draw = weighted_dist(random_number_generator); sampleIDs.push_back(draw); ++inbag_counts[draw]; } - + // Save OOB samples. In holdout mode these are the cases with 0 weight. if (holdout) { for (size_t s = 0; s < (*case_weights).size(); ++s) { @@ -510,7 +510,7 @@ void Tree::bootstrapWeighted() { void Tree::bootstrapWithoutReplacement() { -// Use fraction (default 63.21%) of the samples + // Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; shuffleAndSplit(sampleIDs, oob_sampleIDs, num_samples, num_samples_inbag, random_number_generator); num_samples_oob = oob_sampleIDs.size(); @@ -526,17 +526,17 @@ void Tree::bootstrapWithoutReplacement() { void Tree::bootstrapWithoutReplacementWeighted() { -// Use fraction (default 63.21%) of the samples + // Use fraction (default 63.21%) of the samples size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0]; drawWithoutReplacementWeighted(sampleIDs, random_number_generator, num_samples - 1, num_samples_inbag, *case_weights); -// All observation are 0 or 1 times inbag + // All observation are 0 or 1 times inbag inbag_counts.resize(num_samples, 0); for (auto& sampleID : sampleIDs) { 
inbag_counts[sampleID] = 1; } -// Save OOB samples. In holdout mode these are the cases with 0 weight. + // Save OOB samples. In holdout mode these are the cases with 0 weight. if (holdout) { for (size_t s = 0; s < (*case_weights).size(); ++s) { if ((*case_weights)[s] == 0) { diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 95a986a39..eb95548cd 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -86,7 +86,7 @@ double TreeRegression::estimate(size_t nodeID) { } void TreeRegression::appendToFileInternal(std::ofstream& file) { // #nocov start -// Empty on purpose + // Empty on purpose } // #nocov end bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { @@ -143,7 +143,7 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi } void TreeRegression::createEmptyNodeInternal() { -// Empty on purpose + // Empty on purpose } double TreeRegression::computePredictionAccuracyInternal(std::vector* prediction_error_casewise) { @@ -198,16 +198,16 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ } } -// Stop if no good split found + // Stop if no good split found if (best_decrease < 0) { return true; } -// Save best values + // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; -// Compute decrease of impurity for this node and add to variable importance if needed + // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { addImpurityImportance(nodeID, best_varID, best_decrease); } @@ -364,21 +364,21 @@ void TreeRegression::findBestSplitValueLargeQ(size_t nodeID, size_t varID, doubl void TreeRegression::findBestSplitValueUnordered(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { -// Create possible split values + // Create possible split 
values std::vector factor_levels; data->getAllValues(factor_levels, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); -// Try next variable if all equal for this + // Try next variable if all equal for this if (factor_levels.size() < 2) { return; } -// Number of possible splits is 2^num_levels + // Number of possible splits is 2^num_levels size_t num_splits = (1ULL << factor_levels.size()); -// Compute decrease of impurity for each possible split -// Split where all left (0) or all right (1) are excluded -// The second half of numbers is just left/right switched the first half -> Exclude second half + // Compute decrease of impurity for each possible split + // Split where all left (0) or all right (1) are excluded + // The second half of numbers is just left/right switched the first half -> Exclude second half for (size_t local_splitID = 1; local_splitID < num_splits / 2; ++local_splitID) { // Compute overall splitID by shifting local factorIDs to global positions From b593d655d053a539d1678a09dfc5ae16b3ae4029 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 25 Mar 2020 09:34:03 +0100 Subject: [PATCH 4/9] ENH address review comments for poisson splitrule --- R/ranger.R | 2 +- src/TreeRegression.cpp | 2 ++ tests/testthat/test_poissonsplit.R | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/R/ranger.R b/R/ranger.R index 80e1621f3..1ae80ff9b 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -105,7 +105,7 @@ ##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting. ##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting. ##' @param poisson.tau For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}. -##' If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parend) / (\tau + samples(child) mean(parend))}. 
+##' If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parent) / (\tau + samples(child) mean(parent))}. ##' @param split.select.weights Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used. ##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting. ##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition" for all other splitrules 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details. diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 3933bd0a2..85bcb70da 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -62,6 +62,7 @@ double TreeRegression::estimate(size_t nodeID) { if (child_nodeIDs[0][i] == nodeID) { parent_nodeID = i; found = true; + break; } } if (!found) { @@ -71,6 +72,7 @@ double TreeRegression::estimate(size_t nodeID) { if (child_nodeIDs[1][i] == nodeID) { parent_nodeID = i; found = true; + break; } } } diff --git a/tests/testthat/test_poissonsplit.R b/tests/testthat/test_poissonsplit.R index 04d8ffa90..2640d0b48 100644 --- a/tests/testthat/test_poissonsplit.R +++ b/tests/testthat/test_poissonsplit.R @@ -1,4 +1,5 @@ library(ranger) +context("ranger_poisson") # Generate poisson distributed outcome set.seed(42) From d4f732a6b6037b6811e11e683ddfe00f4251829e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 25 Mar 2020 19:07:50 +0100 Subject: [PATCH 5/9] ENH make Poisson splitrule analogous to findBestSplitValueSmallQ --- man/ranger.Rd | 2 +- src/TreeRegression.cpp | 65 +++++++++++++++++++++--------------------- 
src/TreeRegression.h | 10 +++---- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/man/ranger.Rd b/man/ranger.Rd index 7e73682cb..de8e4eb00 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -84,7 +84,7 @@ For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".} \item{minprop}{For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.} \item{poisson.tau}{For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}. -If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parend) / (\tau + samples(child) mean(parend))}.} +If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parent) / (\tau + samples(child) mean(parent))}.} \item{split.select.weights}{Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. 
Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.} diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 85bcb70da..b4f08e2e3 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -933,7 +933,8 @@ bool TreeRegression::findBestSplitPoisson(size_t nodeID, std::vector& po // For all possible split variables find best split value for (auto& varID : possible_split_varIDs) { - findBestSplitValuePoisson(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, + best_decrease); } // Stop if no good split found @@ -956,8 +957,8 @@ bool TreeRegression::findBestSplitPoisson(size_t nodeID, std::vector& po return false; } -void TreeRegression::findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, - double& best_value, size_t& best_varID, double& best_decrease) { +void TreeRegression::findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, + size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { // Create possible split values std::vector possible_split_values; @@ -973,53 +974,53 @@ void TreeRegression::findBestSplitValuePoisson(size_t nodeID, size_t varID, doub if (memory_saving_splitting) { std::vector sums_right(num_splits); std::vector n_right(num_splits); - findBestSplitValuePoisson(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, + findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, possible_split_values, sums_right, n_right); } else { std::fill_n(sums.begin(), num_splits, 0); std::fill_n(counter.begin(), num_splits, 0); - findBestSplitValuePoisson(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, + findBestSplitValuePoissonSmallQ(nodeID, 
varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, possible_split_values, sums, counter); } } -void TreeRegression::findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, - double& best_value, size_t& best_varID, double& best_decrease, - std::vector possible_split_values, - std::vector& sums_right, std::vector& n_right) { - // -1 because no split possible at largest value - const size_t num_splits = possible_split_values.size() - 1; - - // Sum in right child and possbile split +void TreeRegression::findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, + std::vector& sums, std::vector& counter) { + + // Sum and sample count for possbile splits for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get_x(sampleID, varID); - double response = data->get_y(sampleID, 0); + size_t idx = std::lower_bound(possible_split_values.begin(), possible_split_values.end(), + data->get_x(sampleID, varID)) - possible_split_values.begin(); - // Count samples until split_value reached - for (size_t i = 0; i < num_splits; ++i) { - if (value > possible_split_values[i]) { - ++n_right[i]; - sums_right[i] += response; - } else { - break; - } - } + sums[idx] += data->get_y(sampleID, 0); + ++counter[idx]; } - // Compute Poisson deviance for each possible split - for (size_t i = 0; i < num_splits; ++i) { + size_t n_left = 0; + double sum_left = 0; + + // Compute decrease in Poisson deviance for each possible split + for (size_t i = 0; i < possible_split_values.size() - 1; ++i) { - // Stop if right child empty - size_t n_left = num_samples_node - n_right[i]; - if (n_right[i] == 0) { + // Stop if nothing here + if (counter[i] == 0) { continue; } + n_left += counter[i]; + sum_left += sums[i]; + + // Stop if right 
child empty + size_t n_right = num_samples_node - n_left; + if (n_right == 0) { + break; + } + // Compute mean - double sum_right = sums_right[i]; - double mean_right = sum_right / (double) n_right[i]; - double sum_left = sum_node - sum_right; + double sum_right = sum_node - sum_left; + double mean_right = sum_right / (double) n_right; double mean_left = sum_left / (double) n_left; // Poisson deviance = 2 * (y_true * log(y_true/y_pred) + y_pred - y_true) diff --git a/src/TreeRegression.h b/src/TreeRegression.h index 77dec69f7..b1352f7d4 100644 --- a/src/TreeRegression.h +++ b/src/TreeRegression.h @@ -84,11 +84,11 @@ class TreeRegression: public Tree { std::vector& sums_right, std::vector& n_right); bool findBestSplitPoisson(size_t nodeID, std::vector& possible_split_varIDs); - void findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, - size_t& best_varID, double& best_decrease); - void findBestSplitValuePoisson(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, - size_t& best_varID, double& best_decrease, std::vector possible_split_values, - std::vector& sums_right, std::vector& n_right); + void findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease); + void findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, + std::vector& sums, std::vector& counter); void addImpurityImportance(size_t nodeID, size_t varID, double decrease); From 4ac8f5715fcee6104d5b7735fbe6d893ff53769b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 May 2024 13:48:46 +0200 Subject: [PATCH 6/9] forgot some merge conflicts... 
--- src/Forest.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/Forest.cpp b/src/Forest.cpp index 3bf6a46aa..f4999015b 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -82,13 +82,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode // Call other init function init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, -<<<<<<< HEAD - min_node_size, min_bucket, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, - splitrule, predict_all, sample_fraction_vector, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits, -======= min_node_size_vector, min_bucket_vector, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, - splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, ->>>>>>> master + splitrule, predict_all, sample_fraction_vector, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits, false, max_depth, regularization_factor, regularization_usedepth, false); if (prediction_mode) { @@ -482,13 +477,8 @@ void Forest::grow() { } trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, -<<<<<<< HEAD - importance_mode, min_node_size, min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, - tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, poisson_tau, holdout, num_random_splits, max_depth, -======= importance_mode, &min_node_size, &min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, - tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth, ->>>>>>> master + tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, poisson_tau, holdout, num_random_splits, max_depth, ®ularization_factor, 
regularization_usedepth, &split_varIDs_used, save_node_stats); } From ed2b73d59e96224f3eb016ecf31e6f3417cc0109 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 May 2024 14:28:58 +0200 Subject: [PATCH 7/9] min bucket for Poisson splitting --- src/TreeRegression.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index de4920fcb..08dd350ec 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -1001,10 +1001,14 @@ bool TreeRegression::findBestSplitPoisson(size_t nodeID, std::vector& po // Compute sum of responses in node double sum_node = sumNodeResponse(nodeID); - // For all possible split variables find best split value - for (auto& varID : possible_split_varIDs) { - findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, - best_decrease); + // Stop early if no split posssible + if (num_samples_node >= 2 * (*min_bucket)[0]) { + + // For all possible split variables find best split value + for (auto& varID : possible_split_varIDs) { + findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, + best_decrease); + } } // Stop if no good split found @@ -1088,6 +1092,11 @@ void TreeRegression::findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID break; } + // Stop if minimal bucket size reached + if (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0]) { + continue; + } + // Compute mean double sum_right = sum_node - sum_left; double mean_right = sum_right / (double) n_right; From 24bd1701d7646b67de659f254f0830e8ae106a4d Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 May 2024 16:23:26 +0200 Subject: [PATCH 8/9] add Poisson splitrule to pure C++ version --- cpp_version/src/main.cpp | 2 +- cpp_version/src/utility/ArgumentHandler.cpp | 31 ++++++++++++++++++--- cpp_version/src/utility/ArgumentHandler.h | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) diff 
--git a/cpp_version/src/main.cpp b/cpp_version/src/main.cpp index b4b9d0f0d..79b2b1c38 100644 --- a/cpp_version/src/main.cpp +++ b/cpp_version/src/main.cpp @@ -55,7 +55,7 @@ void run_ranger(const ArgumentHandler& arg_handler, std::ostream& verbose_out) { arg_handler.predict, arg_handler.impmeasure, arg_handler.targetpartitionsize, arg_handler.minbucket, arg_handler.splitweights, arg_handler.alwayssplitvars, arg_handler.statusvarname, arg_handler.replace, arg_handler.catvars, arg_handler.savemem, arg_handler.splitrule, arg_handler.caseweights, arg_handler.predall, arg_handler.fraction, - arg_handler.alpha, arg_handler.minprop, arg_handler.holdout, arg_handler.predictiontype, + arg_handler.alpha, arg_handler.minprop, arg_handler.tau, arg_handler.holdout, arg_handler.predictiontype, arg_handler.randomsplits, arg_handler.maxdepth, arg_handler.regcoef, arg_handler.usedepth); forest->run(true, !arg_handler.skipoob); diff --git a/cpp_version/src/utility/ArgumentHandler.cpp b/cpp_version/src/utility/ArgumentHandler.cpp index 1da4743f4..d5b0a6a53 100644 --- a/cpp_version/src/utility/ArgumentHandler.cpp +++ b/cpp_version/src/utility/ArgumentHandler.cpp @@ -21,7 +21,7 @@ namespace ranger { ArgumentHandler::ArgumentHandler(int argc, char **argv) : caseweights(""), depvarname(""), fraction(0), holdout(false), memmode(MEM_DOUBLE), savemem(false), skipoob(false), predict( - ""), predictiontype(DEFAULT_PREDICTIONTYPE), randomsplits(DEFAULT_NUM_RANDOM_SPLITS), splitweights(""), nthreads( + ""), predictiontype(DEFAULT_PREDICTIONTYPE), randomsplits(DEFAULT_NUM_RANDOM_SPLITS), splitweights(""), tau(DEFAULT_POISSON_TAU), nthreads( DEFAULT_NUM_THREADS), predall(false), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), maxdepth( DEFAULT_MAXDEPTH), file(""), impmeasure(DEFAULT_IMPORTANCE_MODE), targetpartitionsize(0), minbucket(0), mtry(0), outprefix( "ranger_out"), probability(false), splitrule(DEFAULT_SPLITRULE), statusvarname(""), ntree(DEFAULT_NUM_TREE), replace( @@ -33,7 +33,7 @@ 
ArgumentHandler::ArgumentHandler(int argc, char **argv) : int ArgumentHandler::processArguments() { // short options - char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:U:XZa:b:c:d:f:hi:j:kl:m:n:o:pr:s:t:uvwy:z:"; + char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:T:U:XZa:b:c:d:f:hi:j:kl:m:n:o:pr:s:t:uvwy:z:"; // long options: longname, no/optional/required argument?, flag(not used!), shortname const struct option long_options[] = { @@ -50,6 +50,7 @@ int ArgumentHandler::processArguments() { { "predictiontype", required_argument, 0, 'Q'}, { "randomsplits", required_argument, 0, 'R'}, { "splitweights", required_argument, 0, 'S'}, + { "tau", required_argument, 0, 'T'}, { "nthreads", required_argument, 0, 'U'}, { "predall", no_argument, 0, 'X'}, { "version", no_argument, 0, 'Z'}, @@ -178,6 +179,20 @@ int ArgumentHandler::processArguments() { case 'S': splitweights = optarg; break; + + case 'T': + try { + double temp = std::stod(optarg); + if (temp <= 0) { + throw std::runtime_error(""); + } else { + tau = temp; + } + } catch (...) { + throw std::runtime_error( + "Illegal argument for option 'tau'. Please give a positive value. See '--help' for details."); + } + break; case 'U': try { @@ -352,6 +367,9 @@ int ArgumentHandler::processArguments() { case 7: splitrule = HELLINGER; break; + case 8: + splitrule = POISSON; + break; default: throw std::runtime_error(""); break; @@ -512,7 +530,8 @@ void ArgumentHandler::checkArguments() { if (((splitrule == AUC || splitrule == AUC_IGNORE_TIES) && treetype != TREE_SURVIVAL) || (splitrule == MAXSTAT && (treetype != TREE_SURVIVAL && treetype != TREE_REGRESSION)) || (splitrule == BETA && treetype != TREE_REGRESSION) - || (splitrule == HELLINGER && treetype != TREE_CLASSIFICATION && treetype != TREE_PROBABILITY)) { + || (splitrule == HELLINGER && treetype != TREE_CLASSIFICATION && treetype != TREE_PROBABILITY) + || (splitrule == POISSON && treetype != TREE_REGRESSION)) { throw std::runtime_error("Illegal splitrule selected. 
See '--help' for details."); } @@ -658,8 +677,9 @@ void ArgumentHandler::displayHelp() { << " RULE = 4: MAXSTAT for Survival and Regression, not available for Classification." << std::endl; std::cout << " " << " RULE = 5: ExtraTrees for all tree types." << std::endl; - std::cout << " " << " RULE = 6: BETA for regression, only for (0,1) bounded outcomes." << std::endl; + std::cout << " " << " RULE = 6: BETA for Regression, only for (0,1) bounded outcomes." << std::endl; std::cout << " " << " RULE = 7: Hellinger for Classification, not available for Regression and Survival." << std::endl; + std::cout << " " << " RULE = 8: Poisson for Regression, not available for Classification and Survival." << std::endl; std::cout << " " << " (Default: 1)" << std::endl; std::cout << " " << "--randomsplits N Number of random splits to consider for each splitting variable (ExtraTrees splitrule only)." @@ -670,6 +690,9 @@ void ArgumentHandler::displayHelp() { std::cout << " " << "--minprop VAL Lower quantile of covariate distribtuion to be considered for splitting (MAXSTAT splitrule only)." << std::endl; + std::cout << " " + << "--tau VAL Tau parameter for Poisson splitting (Poisson splitrule only)." + << std::endl; std::cout << " " << "--caseweights FILE Filename of case weights file." << std::endl; std::cout << " " << "--holdout Hold-out mode. 
Hold-out all samples with case weight 0 and use these for variable " diff --git a/cpp_version/src/utility/ArgumentHandler.h b/cpp_version/src/utility/ArgumentHandler.h index d6964093f..4395c95d9 100644 --- a/cpp_version/src/utility/ArgumentHandler.h +++ b/cpp_version/src/utility/ArgumentHandler.h @@ -60,6 +60,7 @@ class ArgumentHandler { PredictionType predictiontype; uint randomsplits; std::string splitweights; + double tau; uint nthreads; bool predall; From 6e9d42b9705a68a6e3a9ead1e72cfc8f17991ec3 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 11 Jun 2024 08:22:55 +0200 Subject: [PATCH 9/9] new version for Poisson splitting --- DESCRIPTION | 2 +- NEWS.md | 3 +++ cpp_version/src/version.h | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5e9257cc7..edb4ca139 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.16.1 +Version: 0.16.2 Date: 2024-05-16 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright diff --git a/NEWS.md b/NEWS.md index 00b3a1073..708b2316c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.16.2 +* Add Poisson splitting rule for regression trees + # ranger 0.16.1 * Set num.threads=2 as default; respect environment variables and options * Add hierarchical shrinkage diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index f22802771..673de5f2a 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.16.1" +#define RANGER_VERSION "0.16.2" #endif