diff --git a/DESCRIPTION b/DESCRIPTION index 5e9257cc..edb4ca13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.16.1 +Version: 0.16.2 Date: 2024-05-16 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright diff --git a/NEWS.md b/NEWS.md index 00b3a107..708b2316 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.16.2 +* Add Poisson splitting rule for regression trees + # ranger 0.16.1 * Set num.threads=2 as default; respect environment variables and options * Add hierarchical shrinkage diff --git a/R/RcppExports.R b/R/RcppExports.R index dfac21ec..40c0ea72 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest) { - .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, 
use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest) +rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest) { + .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, 
order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest) } numSmaller <- function(values, reference) { diff --git a/R/predict.R b/R/predict.R index d11c453e..76a00c97 100644 --- a/R/predict.R +++ b/R/predict.R @@ -237,6 +237,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, splitrule <- 1 alpha <- 0 minprop <- 0 + poisson.tau <- 1 case.weights <- c(0, 0) use.case.weights <- FALSE class.weights <- c(0, 0) @@ -276,7 +277,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, prediction.mode, forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule, case.weights, use.case.weights, class.weights, - predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, + predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, regularization.factor, use.regularization.factor, regularization.usedepth, diff --git a/R/ranger.R b/R/ranger.R index 1cc11ff7..971368a0 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -116,10 +116,14 @@ ##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values. ##' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees. ##' @param class.weights Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes. 
-##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank". +##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". +##' For regression "variance", "extratrees", "maxstat", "beta" or "poisson" with default "variance". +##' For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank". ##' @param num.random.splits For "extratrees" splitrule.: Number of random splits to consider for each candidate splitting variable. ##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting. ##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting. +##' @param poisson.tau For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}. +##' If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parent) / (\tau + samples(child) mean(parent))}. ##' @param split.select.weights Numeric vector with weights between 0 and 1, used to calculate the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used. ##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting. ##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition" for all other splitrules 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details. 
@@ -237,6 +241,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, replace = TRUE, sample.fraction = ifelse(replace, 1, 0.632), case.weights = NULL, class.weights = NULL, splitrule = NULL, num.random.splits = 1, alpha = 0.5, minprop = 0.1, + poisson.tau = 1, split.select.weights = NULL, always.split.variables = NULL, respect.unordered.factors = NULL, scale.permutation.importance = FALSE, @@ -818,6 +823,17 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if ((is.factor(y) && nlevels(y) > 2) || (length(unique(y)) > 2)) { stop("Error: Hellinger splitrule only implemented for binary classification.") } + } else if (splitrule == "poisson") { + if (treetype == 3) { + splitrule.num <- 8 + } else { + stop("Error: poisson splitrule applicable to regression data only.") + } + + ## Check for valid responses + if (min(y) < 0 || sum(y) <= 0) { + stop("Error: poisson splitrule applicable to regression data with non-negative outcome (y>=0 and sum(y)>0) only.") + } } else { stop("Error: Unknown splitrule.") } @@ -843,6 +859,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (num.random.splits > 1 && splitrule.num != 5) { warning("Argument 'num.random.splits' ignored if splitrule is not 'extratrees'.") } + + if (!is.numeric(poisson.tau) || poisson.tau <= 0) { + stop("Error: Invalid value for poisson.tau, please give a positive number.") + } ## Unordered factors if (respect.unordered.factors == "partition") { @@ -879,6 +899,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: Unordered factor splitting not implemented for 'C' splitting rule.") } else if (splitrule == "beta") { stop("Error: Unordered factor splitting not implemented for 'beta' splitting rule.") + } else if (splitrule == "poisson") { + stop("Error: Unordered factor splitting not implemented for 'poisson' splitting rule.") + } } @@ -966,10 +988,10 @@ ranger <- function(formula = 
NULL, data = NULL, num.trees = 500, mtry = NULL, prediction.mode, loaded.forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule.num, case.weights, use.case.weights, class.weights, - predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, - num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, - inbag, use.inbag, - regularization.factor, use.regularization.factor, regularization.usedepth, + predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau, + holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, + order.snps, oob.error, max.depth, inbag, use.inbag, + regularization.factor, use.regularization.factor, regularization.usedepth, node.stats, time.interest, use.time.interest) if (length(result) == 0) { diff --git a/cpp_version/src/main.cpp b/cpp_version/src/main.cpp index b4b9d0f0..79b2b1c3 100644 --- a/cpp_version/src/main.cpp +++ b/cpp_version/src/main.cpp @@ -55,7 +55,7 @@ void run_ranger(const ArgumentHandler& arg_handler, std::ostream& verbose_out) { arg_handler.predict, arg_handler.impmeasure, arg_handler.targetpartitionsize, arg_handler.minbucket, arg_handler.splitweights, arg_handler.alwayssplitvars, arg_handler.statusvarname, arg_handler.replace, arg_handler.catvars, arg_handler.savemem, arg_handler.splitrule, arg_handler.caseweights, arg_handler.predall, arg_handler.fraction, - arg_handler.alpha, arg_handler.minprop, arg_handler.holdout, arg_handler.predictiontype, + arg_handler.alpha, arg_handler.minprop, arg_handler.tau, arg_handler.holdout, arg_handler.predictiontype, arg_handler.randomsplits, arg_handler.maxdepth, arg_handler.regcoef, arg_handler.usedepth); forest->run(true, !arg_handler.skipoob); diff --git a/cpp_version/src/utility/ArgumentHandler.cpp b/cpp_version/src/utility/ArgumentHandler.cpp index 1da4743f..d5b0a6a5 100644 --- a/cpp_version/src/utility/ArgumentHandler.cpp +++ 
b/cpp_version/src/utility/ArgumentHandler.cpp @@ -21,7 +21,7 @@ namespace ranger { ArgumentHandler::ArgumentHandler(int argc, char **argv) : caseweights(""), depvarname(""), fraction(0), holdout(false), memmode(MEM_DOUBLE), savemem(false), skipoob(false), predict( - ""), predictiontype(DEFAULT_PREDICTIONTYPE), randomsplits(DEFAULT_NUM_RANDOM_SPLITS), splitweights(""), nthreads( + ""), predictiontype(DEFAULT_PREDICTIONTYPE), randomsplits(DEFAULT_NUM_RANDOM_SPLITS), splitweights(""), tau(DEFAULT_POISSON_TAU), nthreads( DEFAULT_NUM_THREADS), predall(false), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), maxdepth( DEFAULT_MAXDEPTH), file(""), impmeasure(DEFAULT_IMPORTANCE_MODE), targetpartitionsize(0), minbucket(0), mtry(0), outprefix( "ranger_out"), probability(false), splitrule(DEFAULT_SPLITRULE), statusvarname(""), ntree(DEFAULT_NUM_TREE), replace( @@ -33,7 +33,7 @@ ArgumentHandler::ArgumentHandler(int argc, char **argv) : int ArgumentHandler::processArguments() { // short options - char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:U:XZa:b:c:d:f:hi:j:kl:m:n:o:pr:s:t:uvwy:z:"; + char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:T:U:XZa:b:c:d:f:hi:j:kl:m:n:o:pr:s:t:uvwy:z:"; // long options: longname, no/optional/required argument?, flag(not used!), shortname const struct option long_options[] = { @@ -50,6 +50,7 @@ int ArgumentHandler::processArguments() { { "predictiontype", required_argument, 0, 'Q'}, { "randomsplits", required_argument, 0, 'R'}, { "splitweights", required_argument, 0, 'S'}, + { "tau", required_argument, 0, 'T'}, { "nthreads", required_argument, 0, 'U'}, { "predall", no_argument, 0, 'X'}, { "version", no_argument, 0, 'Z'}, @@ -178,6 +179,20 @@ int ArgumentHandler::processArguments() { case 'S': splitweights = optarg; break; + + case 'T': + try { + double temp = std::stod(optarg); + if (temp <= 0) { + throw std::runtime_error(""); + } else { + tau = temp; + } + } catch (...) { + throw std::runtime_error( + "Illegal argument for option 'tau'. 
Please give a positive value. See '--help' for details."); + } + break; case 'U': try { @@ -352,6 +367,9 @@ int ArgumentHandler::processArguments() { case 7: splitrule = HELLINGER; break; + case 8: + splitrule = POISSON; + break; default: throw std::runtime_error(""); break; @@ -512,7 +530,8 @@ void ArgumentHandler::checkArguments() { if (((splitrule == AUC || splitrule == AUC_IGNORE_TIES) && treetype != TREE_SURVIVAL) || (splitrule == MAXSTAT && (treetype != TREE_SURVIVAL && treetype != TREE_REGRESSION)) || (splitrule == BETA && treetype != TREE_REGRESSION) - || (splitrule == HELLINGER && treetype != TREE_CLASSIFICATION && treetype != TREE_PROBABILITY)) { + || (splitrule == HELLINGER && treetype != TREE_CLASSIFICATION && treetype != TREE_PROBABILITY) + || (splitrule == POISSON && treetype != TREE_REGRESSION)) { throw std::runtime_error("Illegal splitrule selected. See '--help' for details."); } @@ -658,8 +677,9 @@ void ArgumentHandler::displayHelp() { << " RULE = 4: MAXSTAT for Survival and Regression, not available for Classification." << std::endl; std::cout << " " << " RULE = 5: ExtraTrees for all tree types." << std::endl; - std::cout << " " << " RULE = 6: BETA for regression, only for (0,1) bounded outcomes." << std::endl; + std::cout << " " << " RULE = 6: BETA for Regression, only for (0,1) bounded outcomes." << std::endl; std::cout << " " << " RULE = 7: Hellinger for Classification, not available for Regression and Survival." << std::endl; + std::cout << " " << " RULE = 8: Poisson for Regression, not available for Classification and Survival." << std::endl; std::cout << " " << " (Default: 1)" << std::endl; std::cout << " " << "--randomsplits N Number of random splits to consider for each splitting variable (ExtraTrees splitrule only)." @@ -670,6 +690,9 @@ void ArgumentHandler::displayHelp() { std::cout << " " << "--minprop VAL Lower quantile of covariate distribtuion to be considered for splitting (MAXSTAT splitrule only)." 
<< std::endl; + std::cout << " " + << "--tau VAL Tau parameter for Poisson splitting (Poisson splitrule only)." + << std::endl; std::cout << " " << "--caseweights FILE Filename of case weights file." << std::endl; std::cout << " " << "--holdout Hold-out mode. Hold-out all samples with case weight 0 and use these for variable " diff --git a/cpp_version/src/utility/ArgumentHandler.h b/cpp_version/src/utility/ArgumentHandler.h index d6964093..4395c95d 100644 --- a/cpp_version/src/utility/ArgumentHandler.h +++ b/cpp_version/src/utility/ArgumentHandler.h @@ -60,6 +60,7 @@ class ArgumentHandler { PredictionType predictiontype; uint randomsplits; std::string splitweights; + double tau; uint nthreads; bool predall; diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index f2280277..673de5f2 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.16.1" +#define RANGER_VERSION "0.16.2" #endif diff --git a/man/ranger.Rd b/man/ranger.Rd index 9f8036b7..d8883713 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -23,6 +23,7 @@ ranger( num.random.splits = 1, alpha = 0.5, minprop = 0.1, + poisson.tau = 1, split.select.weights = NULL, always.split.variables = NULL, respect.unordered.factors = NULL, @@ -78,7 +79,9 @@ ranger( \item{class.weights}{Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.} -\item{splitrule}{Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".} +\item{splitrule}{Splitting rule. 
For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". +For regression "variance", "extratrees", "maxstat", "beta" or "poisson" with default "variance". +For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".} \item{num.random.splits}{For "extratrees" splitrule.: Number of random splits to consider for each candidate splitting variable.} @@ -86,6 +89,9 @@ ranger( \item{minprop}{For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.} +\item{poisson.tau}{For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}. +If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha 0 + (1-\alpha) mean(parent)} with \eqn{\alpha = samples(child) mean(parent) / (\tau + samples(child) mean(parent))}.} + \item{split.select.weights}{Numeric vector with weights between 0 and 1, used to calculate the probability to select variables for splitting. 
Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.} \item{always.split.variables}{Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting.} diff --git a/src/Forest.cpp b/src/Forest.cpp index 7100cca3..f4999015 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -31,8 +31,8 @@ Forest::Forest() : 0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting( false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout( false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( - DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), num_threads(DEFAULT_NUM_THREADS), data { }, overall_prediction_error( - NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), progress(0) { + DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_threads(DEFAULT_NUM_THREADS), data { }, + overall_prediction_error(NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), progress(0) { } // #nocov start @@ -42,8 +42,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode std::string split_select_weights_file, const std::vector& always_split_variable_names, std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout, - PredictionType prediction_type, uint num_random_splits, uint max_depth, + std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, + double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, 
uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { this->memory_mode = memory_mode; @@ -83,7 +83,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode // Call other init function init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, min_node_size_vector, min_bucket_vector, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, - splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, + splitrule, predict_all, sample_fraction_vector, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits, false, max_depth, regularization_factor, regularization_usedepth, false); if (prediction_mode) { @@ -141,7 +141,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, bool keep_inbag, - std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, + std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, bool node_stats) { @@ -150,7 +150,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, // Call other init function init(std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, min_bucket, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, - predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, + 
predict_all, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth, node_stats); // Set variables to be always considered for splitting @@ -184,7 +184,7 @@ void Forest::init(std::unique_ptr input_data, uint mtry, std::string outpu uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, - double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, + double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, bool node_stats) { // Initialize data with memmode @@ -222,6 +222,7 @@ void Forest::init(std::unique_ptr input_data, uint mtry, std::string outpu this->holdout = holdout; this->alpha = alpha; this->minprop = minprop; + this->poisson_tau = poisson_tau; this->prediction_type = prediction_type; this->num_random_splits = num_random_splits; this->max_depth = max_depth; @@ -477,7 +478,7 @@ void Forest::grow() { trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, importance_mode, &min_node_size, &min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, - tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth, + tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, poisson_tau, holdout, num_random_splits, max_depth, ®ularization_factor, regularization_usedepth, &split_varIDs_used, save_node_stats); } diff --git a/src/Forest.h b/src/Forest.h 
index e9279154..05d6a2f9 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -45,7 +45,7 @@ class Forest { std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, - bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, + double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, @@ -53,7 +53,7 @@ class Forest { const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, - bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, + bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, bool node_stats); @@ -61,8 +61,8 @@ class Forest { uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, - double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, - bool 
order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, + double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, + bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, bool node_stats); virtual void initInternal() = 0; @@ -231,6 +231,9 @@ class Forest { // MAXSTAT splitrule double alpha; double minprop; + + // POISSON splitrule + double poisson_tau; // Multithreading uint num_threads; diff --git a/src/ForestRegression.cpp b/src/ForestRegression.cpp index 6328ac20..43a07c9d 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -66,6 +66,21 @@ void ForestRegression::initInternal() { } } } + + // Error if poisson splitrule used with negative data + if (splitrule == POISSON && !prediction_mode) { + double y_sum = 0; + for (size_t i = 0; i < num_samples; ++i) { + double y = data->get_y(i, 0); + y_sum += y; + if (y < 0) { + throw std::runtime_error("Poisson splitrule applicable to regression data with non-negative outcome (y>=0 and sum(y)>0) only."); + } + } + if (y_sum <= 0) { + throw std::runtime_error("Poisson splitrule applicable to regression data with non-negative outcome (y>=0 and sum(y)>0) only."); + } + } // Sort data if memory saving mode if (!memory_saving_splitting) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0c9ac2bf..8bf96df7 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -13,8 +13,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // rangerCpp -Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, std::vector& min_node_size, std::vector& min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& 
always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, bool node_stats, std::vector& time_interest, bool use_time_interest); -RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP 
regularization_usedepthSEXP, SEXP node_statsSEXP, SEXP time_interestSEXP, SEXP use_time_interestSEXP) { +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, std::vector& min_node_size, std::vector& min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, bool node_stats, std::vector& time_interest, bool use_time_interest); +RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP 
save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP poisson_tauSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP, SEXP node_statsSEXP, SEXP time_interestSEXP, SEXP use_time_interestSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -52,6 +52,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::vector& >::type sample_fraction(sample_fractionSEXP); Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< double >::type minprop(minpropSEXP); + Rcpp::traits::input_parameter< double >::type poisson_tau(poisson_tauSEXP); Rcpp::traits::input_parameter< bool >::type holdout(holdoutSEXP); Rcpp::traits::input_parameter< uint >::type prediction_type_r(prediction_type_rSEXP); Rcpp::traits::input_parameter< uint >::type num_random_splits(num_random_splitsSEXP); @@ -68,7 +69,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type node_stats(node_statsSEXP); Rcpp::traits::input_parameter< std::vector& >::type time_interest(time_interestSEXP); Rcpp::traits::input_parameter< bool >::type use_time_interest(use_time_interestSEXP); - rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, 
splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest)); + rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest)); return rcpp_result_gen; END_RCPP } @@ -147,7 +148,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 50}, + {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 51}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3}, {"_ranger_hshrink_regr", (DL_FUNC) &_ranger_hshrink_regr, 10}, diff --git a/src/Tree.cpp b/src/Tree.cpp index fd97f686..16abe0ee 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -18,11 +18,11 @@ namespace ranger { Tree::Tree() : mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), min_bucket(0), deterministic_varIDs(0), split_select_weights(0), case_weights( - 0), manual_inbag(0), oob_sampleIDs(0), save_node_stats(false), 
num_samples_nodes(0), node_predictions(0), - holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), + 0), manual_inbag(0), oob_sampleIDs(0), save_node_stats(false), num_samples_nodes(0), node_predictions(0), + holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( true), sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( - DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( + DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( 0) { } @@ -34,7 +34,7 @@ Tree::Tree(std::vector>& child_nodeIDs, std::vector& holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used( 0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction( 0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( - DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( + DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( 0) { } @@ -42,7 +42,7 @@ void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std: std::vector* split_select_weights, ImportanceMode importance_mode, std::vector* min_node_size, std::vector* min_bucket, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, - double 
minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, + double minprop, double poisson_tau, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, std::vector* split_varIDs_used, bool save_node_stats) { this->data = data; @@ -73,6 +73,7 @@ void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std: this->holdout = holdout; this->alpha = alpha; this->minprop = minprop; + this->poisson_tau = poisson_tau; this->num_random_splits = num_random_splits; this->max_depth = max_depth; this->regularization_factor = regularization_factor; @@ -493,19 +494,19 @@ void Tree::bootstrapWeighted() { // Reserve space, reserve a little more to be save) sampleIDs.reserve(num_samples_inbag); oob_sampleIDs.reserve(num_samples * (exp(-(*sample_fraction)[0]) + 0.1)); - + std::discrete_distribution<> weighted_dist(case_weights->begin(), case_weights->end()); // Start with all samples OOB inbag_counts.resize(num_samples, 0); - + // Draw num_samples samples with replacement (n out of n) as inbag and mark as not OOB for (size_t s = 0; s < num_samples_inbag; ++s) { size_t draw = weighted_dist(random_number_generator); sampleIDs.push_back(draw); ++inbag_counts[draw]; } - + // Save OOB samples. In holdout mode these are the cases with 0 weight. 
if (holdout) { for (size_t s = 0; s < (*case_weights).size(); ++s) { diff --git a/src/Tree.h b/src/Tree.h index 17b98ff0..5f8a1dfe 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -39,7 +39,7 @@ class Tree { std::vector* split_select_weights, ImportanceMode importance_mode, std::vector* min_node_size, std::vector* min_bucket, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, - std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, + std::vector* sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, std::vector* split_varIDs_used, bool save_node_stats); @@ -242,6 +242,7 @@ class Tree { SplitRule splitrule; double alpha; double minprop; + double poisson_tau; uint num_random_splits; uint max_depth; uint depth; diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index ec59528f..08dd350e 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -44,13 +44,48 @@ void TreeRegression::allocateMemory() { double TreeRegression::estimate(size_t nodeID) { // Mean of responses of samples in node - double sum_responses_in_node = 0; + double sum_responses_in_node = sumNodeResponse(nodeID); + size_t num_samples_in_node = end_pos[nodeID] - start_pos[nodeID]; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_responses_in_node += data->get_y(sampleID, 0); + if (splitrule == POISSON && sum_responses_in_node == 0.) { + // Poisson is not allowed to predict 0. 
+ // We use a weighted average of parent and child mean values, + // see vignette "Introduction to Rpart" Chapter 8.2 and + // https://ssrn.com/abstract=2870308 Chapter 6.1.3 + + // Search for parent's nodeID: loop over all nodeIDs + size_t parent_nodeID = 0; + bool found = false; + // Loop over left child nodes + for(std::size_t i = 0; i < child_nodeIDs[0].size(); ++i) { + // Break if parent node found + if (child_nodeIDs[0][i] == nodeID) { + parent_nodeID = i; + found = true; + break; + } + } + if (!found) { + // Loop over right child nodes + for(std::size_t i = 0; i < child_nodeIDs[1].size(); ++i) { + // Break if parent node found + if (child_nodeIDs[1][i] == nodeID) { + parent_nodeID = i; + found = true; + break; + } + } + } + + double sum_responses_in_parent = sumNodeResponse(parent_nodeID); + size_t num_samples_in_parent = end_pos[parent_nodeID] - start_pos[parent_nodeID]; + double mean_node = (sum_responses_in_node / (double) num_samples_in_node); + double mean_parent = (sum_responses_in_parent / (double) num_samples_in_parent); + double alpha = num_samples_in_node * mean_parent/(num_samples_in_node * mean_parent + poisson_tau); + return alpha * mean_node + (1 - alpha) * mean_parent; + } else { + return (sum_responses_in_node / (double) num_samples_in_node); } - return (sum_responses_in_node / (double) num_samples_in_node); } void TreeRegression::appendToFileInternal(std::ofstream& file) { // #nocov start @@ -86,7 +121,11 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi pure_value = value; } if (pure) { - split_values[nodeID] = pure_value; + if (splitrule == POISSON && pure_value == 0.) 
{ + split_values[nodeID] = estimate(nodeID); + } else { + split_values[nodeID] = pure_value; + } return true; } @@ -98,6 +137,8 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi stop = findBestSplitExtraTrees(nodeID, possible_split_varIDs); } else if (splitrule == BETA) { stop = findBestSplitBeta(nodeID, possible_split_varIDs); + } else if (splitrule == POISSON) { + stop = findBestSplitPoisson(nodeID, possible_split_varIDs); } else { stop = findBestSplit(nodeID, possible_split_varIDs); } @@ -143,11 +184,7 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ double best_value = 0; // Compute sum of responses in node - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } + double sum_node = sumNodeResponse(nodeID); // Stop early if no split posssible if (num_samples_node >= 2 * (*min_bucket)[0]) { @@ -541,11 +578,7 @@ bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector& double best_value = 0; // Compute sum of responses in node - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } + double sum_node = sumNodeResponse(nodeID); // Stop early if no split posssible if (num_samples_node >= 2 * (*min_bucket)[0]) { @@ -786,11 +819,7 @@ bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector& possi double best_value = 0; // Compute sum of responses in node - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } + double sum_node = sumNodeResponse(nodeID); // Stop early if no split posssible if (num_samples_node >= 2 * (*min_bucket)[0]) { @@ -962,18 +991,156 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double } } +bool 
TreeRegression::findBestSplitPoisson(size_t nodeID, std::vector& possible_split_varIDs) { + + size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; + double best_decrease = -std::numeric_limits::infinity(); + size_t best_varID = 0; + double best_value = 0; + + // Compute sum of responses in node + double sum_node = sumNodeResponse(nodeID); + + // Stop early if no split possible + if (num_samples_node >= 2 * (*min_bucket)[0]) { + + // For all possible split variables find best split value + for (auto& varID : possible_split_varIDs) { + findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, + best_decrease); + } + } + + // Stop if no good split found + if (std::isinf(-best_decrease)) { + return true; + } + + // Save best values + split_varIDs[nodeID] = best_varID; + split_values[nodeID] = best_value; + + // Compute decrease of impurity for this node and add to variable importance if needed + if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { + addImpurityImportance(nodeID, best_varID, best_decrease); + } + + // Regularization + saveSplitVarID(best_varID); + + return false; +} + +void TreeRegression::findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, + size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease) { + + // Create possible split values + std::vector possible_split_values; + data->getAllValues(possible_split_values, sampleIDs, varID, start_pos[nodeID], end_pos[nodeID]); + + // Try next variable if all equal for this + if (possible_split_values.size() < 2) { + return; + } + + // -1 because no split possible at largest value + const size_t num_splits = possible_split_values.size() - 1; + if (memory_saving_splitting) { + std::vector sums_right(num_splits); + std::vector n_right(num_splits); + findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, +
possible_split_values, sums_right, n_right); + } else { + std::fill_n(sums.begin(), num_splits, 0); + std::fill_n(counter.begin(), num_splits, 0); + findBestSplitValuePoissonSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease, + possible_split_values, sums, counter); + } +} + +void TreeRegression::findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, + std::vector& sums, std::vector& counter) { + + // Sum and sample count for possible splits + for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { + size_t sampleID = sampleIDs[pos]; + size_t idx = std::lower_bound(possible_split_values.begin(), possible_split_values.end(), + data->get_x(sampleID, varID)) - possible_split_values.begin(); + + sums[idx] += data->get_y(sampleID, 0); + ++counter[idx]; + } + + size_t n_left = 0; + double sum_left = 0; + + // Compute decrease in Poisson deviance for each possible split + for (size_t i = 0; i < possible_split_values.size() - 1; ++i) { + + // Stop if nothing here + if (counter[i] == 0) { + continue; + } + + n_left += counter[i]; + sum_left += sums[i]; + + // Stop if right child empty + size_t n_right = num_samples_node - n_left; + if (n_right == 0) { + break; + } + + // Stop if minimal bucket size reached + if (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0]) { + continue; + } + + // Compute mean + double sum_right = sum_node - sum_left; + double mean_right = sum_right / (double) n_right; + double mean_left = sum_left / (double) n_left; + + // Poisson deviance = 2 * (y_true * log(y_true/y_pred) + y_pred - y_true) + // decrease = - 1/2 * (sum_left(poisson_deviance) + sum_right(poisson_deviance)) + // = + sum_left(y) * log(mean_left) + sum_right(y) * log(mean_right) + const + 0 + // The smaller the deviance, the better => the larger the decrease, the better.
+ double decrease = xlogy(sum_left, mean_left) + xlogy(sum_right, mean_right); + + // Stop if no result + if (std::isnan(decrease)) { + continue; + } + + // Regularization + if (decrease > 0) { + regularize(decrease, varID); + } else { + regularizeNegative(decrease, varID); + } + + // If better than before, use this + if (decrease > best_decrease) { + best_value = (possible_split_values[i] + possible_split_values[i + 1]) / 2; + best_varID = varID; + best_decrease = decrease; + + // Use smaller value if average is numerically the same as the larger value + if (best_value == possible_split_values[i + 1]) { + best_value = possible_split_values[i]; + } + } + } +} + void TreeRegression::addImpurityImportance(size_t nodeID, size_t varID, double decrease) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; double best_decrease = decrease; if (splitrule != MAXSTAT) { - double sum_node = 0; - for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { - size_t sampleID = sampleIDs[pos]; - sum_node += data->get_y(sampleID, 0); - } - + double sum_node = sumNodeResponse(nodeID); double impurity_node = (sum_node * sum_node / (double) num_samples_node); // Account for the regularization diff --git a/src/TreeRegression.h b/src/TreeRegression.h index 84c224f6..b1352f7d 100644 --- a/src/TreeRegression.h +++ b/src/TreeRegression.h @@ -82,10 +82,26 @@ class TreeRegression: public Tree { void findBestSplitValueBeta(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, std::vector& sums_right, std::vector& n_right); + + bool findBestSplitPoisson(size_t nodeID, std::vector& possible_split_varIDs); + void findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease); + void findBestSplitValuePoissonSmallQ(size_t nodeID, size_t varID, double 
sum_node, size_t num_samples_node, + double& best_value, size_t& best_varID, double& best_decrease, std::vector possible_split_values, + std::vector& sums, std::vector& counter); void addImpurityImportance(size_t nodeID, size_t varID, double decrease); double computePredictionMSE(); + + // Compute sum of responses in node. As in-class definition, this is inline by default. + double sumNodeResponse(size_t nodeID) { + double sum_node = 0; + for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { + sum_node += data->get_y(sampleIDs[pos], 0); + } + return sum_node; + } void cleanUpInternal() override { counter.clear(); diff --git a/src/globals.h b/src/globals.h index 4deebb7b..691b1e0d 100644 --- a/src/globals.h +++ b/src/globals.h @@ -60,7 +60,8 @@ enum SplitRule { MAXSTAT = 4, EXTRATREES = 5, BETA = 6, - HELLINGER = 7 + HELLINGER = 7, + POISSON = 8 }; // Prediction type @@ -85,6 +86,8 @@ const SplitRule DEFAULT_SPLITRULE = LOGRANK; const double DEFAULT_ALPHA = 0.5; const double DEFAULT_MINPROP = 0.1; +const double DEFAULT_POISSON_TAU = 1; + const uint DEFAULT_MAXDEPTH = 0; const PredictionType DEFAULT_PREDICTIONTYPE = RESPONSE; const uint DEFAULT_NUM_RANDOM_SPLITS = 1; diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index 66e34c0e..476a6ed1 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -57,8 +57,8 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, - std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, - uint num_random_splits, Eigen::SparseMatrix& sparse_x, + std::vector& sample_fraction, double alpha, double minprop, double poisson_tau, + bool holdout, uint prediction_type_r, uint 
num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, @@ -155,7 +155,7 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM forest->initR(std::move(data), mtry, num_trees, verbose_out, seed, num_threads, importance_mode, min_node_size, min_bucket, split_select_weights, always_split_variable_names, prediction_mode, sample_with_replacement, unordered_variable_names, save_memory, splitrule, case_weights, - inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, + inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth, node_stats); // Load forest object if in prediction mode diff --git a/src/utility.h b/src/utility.h index de22237c..24b2b263 100644 --- a/src/utility.h +++ b/src/utility.h @@ -12,6 +12,7 @@ #ifndef UTILITY_H_ #define UTILITY_H_ +#include #include #include #include @@ -525,7 +526,21 @@ std::stringstream& readFromStream(std::stringstream& in, double& token); */ double betaLogLik(double y, double mean, double phi); -/* +/** + * Compute x * log(y) with 0 * log(..) equal to 0. + * @param x + * @param y + * @return x * log(y) + */ +inline double xlogy(double x, double y) { + if (x == 0) { + return 0; + } else { + return x * log(y); + } +} + +/** * Returns the natural logarithm of the absolute value of the gamma function of x. * @param x Parameter for the log-gamma function. 
*/ diff --git a/tests/testthat/test_poissonsplit.R b/tests/testthat/test_poissonsplit.R new file mode 100644 index 00000000..2640d0b4 --- /dev/null +++ b/tests/testthat/test_poissonsplit.R @@ -0,0 +1,67 @@ +library(ranger) +context("ranger_poisson") + +# Generate poisson distributed outcome +set.seed(42) +n <- 1000 +p <- 4 +beta <- c(0, 0.1, -0.2, 0.3) +x <- replicate(p, runif(n)) +# Use exp(..) to keep it positive (and adds interactions). +# Add -1 to make it small as Poisson should be better for small frequencies. +lambda <- exp(-1 + as.vector(x %*% beta)) +y <- rpois(n, lambda) +df <- data.frame(y = y, x) + +# And a simple dataset with zero outcomes +df2 <- data.frame(y = c(0, 0, 0, 0, 0, 1, 2, 3, 4, 5), + x1 = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"), + x2 = c(0, 0, 0, 0, 0, 1, 1, 1, 2, 2)) + +poisson_deviance <- function(y_true, y_pred) { + if (any(y_true == 0 & y_pred == 0)) { + stop("Error: Poisson deviance does not exist for y_pred == y_true == 0.") + } + pos <- y_true > 0 + dev <- y_pred + dev[pos] <- y_true[pos] * log(y_true[pos] / y_pred[pos]) - y_true[pos] + y_pred[pos] + return(2 * mean(dev)) +} + + +test_that("poisson splitting works on poisson distributed data", { + n_train = 1:(4*n %/% 5) + n_test = (max(n_train)+1):n + df_train = df[n_train, ] + df_test = df[n_test, ] + rf_poi <- ranger(y ~ ., df_train, splitrule = "poisson", num.trees = 50, min.node.size = 50, poisson.tau = 1, seed = 12) + rf_mse <- ranger(y ~ ., df_train, splitrule = "variance", num.trees = 50, min.node.size = 50, seed = 13) + + expect_is(rf_poi, "ranger") + # deviance on test set + expect_lt(poisson_deviance(df_test$y, predict(rf_poi, df_test)$predictions), + poisson_deviance(df_test$y, predict(rf_mse, df_test)$predictions)) +}) + +test_that("poisson splitting not working for negative outcome", { + expect_error(ranger(y ~ ., data.frame(y = c(-1.5, 2), x = c(1, 2)), splitrule = "poisson")) + expect_error(ranger(y ~ ., data.frame(y = c(0, 0), x = c(1, 2)), splitrule = 
"poisson")) +}) + +test_that("poisson.tau <= 0 throws error", { + expect_error(ranger(y ~ ., df2, poisson.tau = 0)) +}) + +test_that("poisson splitting predicts positive even on nodes with all values equal 0", { + rf <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 0.1, mtry = 2, num.trees = 2, + min.node.size = 1, seed = 123) + expect_true(all(c(predict(rf, df2, predict.all = TRUE)$predictions) > 0)) +}) + +test_that("poisson splitting gives larger predictions for larger values of poisson.tau on pure nodes with y = 0", { + rf1 <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 0.1, mtry = 2, num.trees = 2, + min.node.size = 1, seed = 123) + rf2 <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 10, mtry = 2, num.trees = 2, + min.node.size = 1, seed = 123) + expect_true(all(predict(rf2, df2)$predictions[df2$y == 0] > predict(rf1, df2)$predictions[df2$y == 0])) +})