Add Poisson splitting rule #495

Merged 12 commits on Jun 11, 2024
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -17,6 +17,6 @@ LinkingTo: Rcpp, RcppEigen
Depends: R (>= 3.1)
Suggests: survival, testthat
Encoding: UTF-8
RoxygenNote: 7.0.2
RoxygenNote: 7.1.0
URL: https://github.com/imbs-hl/ranger
BugReports: https://github.com/imbs-hl/ranger/issues
4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -1,8 +1,8 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)
rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)
}

numSmaller <- function(values, reference) {
3 changes: 2 additions & 1 deletion R/predict.R
@@ -233,6 +233,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
splitrule <- 1
alpha <- 0
minprop <- 0
poisson.tau <- 1
case.weights <- c(0, 0)
use.case.weights <- FALSE
class.weights <- c(0, 0)
@@ -269,7 +270,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
prediction.mode, forest, snp.data, replace, probability,
unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule,
case.weights, use.case.weights, class.weights,
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout,
predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau, holdout,
prediction.type, num.random.splits, sparse.x, use.sparse.data,
order.snps, oob.error, max.depth, inbag, use.inbag,
regularization.factor, use.regularization.factor, regularization.usedepth)
30 changes: 26 additions & 4 deletions R/ranger.R
@@ -98,10 +98,14 @@
##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values.
##' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.
##' @param class.weights Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.
##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".
##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini".
##' For regression "variance", "extratrees", "maxstat", "beta" or "poisson" with default "variance".
##' For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".
##' @param num.random.splits For "extratrees" splitrule: Number of random splits to consider for each candidate splitting variable.
##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting.
##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.
##' @param poisson.tau For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}.
##' If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha \cdot 0 + (1-\alpha) \cdot mean(parent)} with \eqn{\alpha = samples(child) \cdot mean(parent) / (\tau + samples(child) \cdot mean(parent))}.
##' @param split.select.weights Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.
##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting.
##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition" for all other splitrules 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details.
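
A minimal usage sketch of the interface this PR adds: fitting a regression forest on simulated count data with the new "poisson" splitting rule and its `poisson.tau` tuning parameter. The simulated data, variable names, and parameter values below are illustrative only and are not part of the diff; only `splitrule = "poisson"` and `poisson.tau` come from this PR (so a ranger build containing this change is assumed).

```r
## Hedged usage sketch (not part of the diff).
library(ranger)

set.seed(42)
n  <- 500
x1 <- runif(n)
x2 <- runif(n)
## Count outcome: non-negative with sum(y) > 0, as required by the new check below.
y  <- rpois(n, lambda = exp(1 + x1 - 0.5 * x2))
dat <- data.frame(y = y, x1 = x1, x2 = x2)

fit <- ranger(y ~ ., data = dat, num.trees = 100,
              splitrule   = "poisson",  # new regression splitting rule
              poisson.tau = 1)          # new tuning parameter (default 1)

head(predict(fit, dat)$predictions)
```
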
@@ -213,6 +217,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
sample.fraction = ifelse(replace, 1, 0.632),
case.weights = NULL, class.weights = NULL, splitrule = NULL,
num.random.splits = 1, alpha = 0.5, minprop = 0.1,
poisson.tau = 1,
split.select.weights = NULL, always.split.variables = NULL,
respect.unordered.factors = NULL,
scale.permutation.importance = FALSE,
@@ -729,6 +734,17 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
if ((is.factor(y) && nlevels(y) > 2) || (length(unique(y)) > 2)) {
stop("Error: Hellinger splitrule only implemented for binary classification.")
}
} else if (splitrule == "poisson") {
if (treetype == 3) {
splitrule.num <- 8
} else {
stop("Error: poisson splitrule applicable to regression data only.")
}

## Check for valid responses
if (min(y) < 0 || sum(y) <= 0) {
stop("Error: poisson splitrule applicable to regression data with non-positive outcome (y>=0 and sum(y)>0) only.")
}
} else {
stop("Error: Unknown splitrule.")
}
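
To make this hunk and the `poisson.tau` documentation above concrete, here is a hedged standalone R sketch of the response check (non-negative outcome with at least one positive value) and the shrinkage estimate described for child nodes containing only zero responses. It illustrates the documented formula; it is not the C++ splitting code this PR adds, and the plain node mean used for the non-zero case is an assumption (the usual Poisson maximum-likelihood estimate), not something spelled out in the diff.

```r
## Standalone illustration, not the C++ implementation.

## Response check mirroring the validation above: y >= 0 and sum(y) > 0.
poisson_response_ok <- function(y) {
  is.numeric(y) && min(y) >= 0 && sum(y) > 0
}

## Estimate for a child node, following the poisson.tau documentation:
## for a zero-only child, alpha * 0 + (1 - alpha) * mean(parent) with
## alpha = n_child * mean(parent) / (tau + n_child * mean(parent)).
## The non-zero branch (plain node mean) is an assumption for illustration.
child_node_estimate <- function(y_child, mean_parent, tau = 1) {
  if (sum(y_child) > 0) {
    return(mean(y_child))
  }
  n_child <- length(y_child)
  alpha <- n_child * mean_parent / (tau + n_child * mean_parent)
  alpha * 0 + (1 - alpha) * mean_parent
}

poisson_response_ok(c(0, 1, 2, 0))                  # TRUE
poisson_response_ok(c(-1, 2, 3))                    # FALSE: negative response
poisson_response_ok(c(0, 0, 0))                     # FALSE: sum(y) == 0
child_node_estimate(c(0, 0, 0), mean_parent = 0.8)  # ~0.235, between 0 and the parent mean
```

Per this formula, a larger `poisson.tau` moves the zero-only child estimate closer to the parent mean, while a smaller value lets it drop toward zero, which is why the argument check later in this file requires `poisson.tau` to be a positive number.
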
@@ -754,6 +770,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
if (num.random.splits > 1 && splitrule.num != 5) {
warning("Argument 'num.random.splits' ignored if splitrule is not 'extratrees'.")
}

if (!is.numeric(poisson.tau) || poisson.tau <= 0) {
stop("Error: Invalid value for poisson.tau, please give a positive number.")
}

## Unordered factors
if (respect.unordered.factors == "partition") {
@@ -790,6 +810,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
stop("Error: Unordered factor splitting not implemented for 'C' splitting rule.")
} else if (splitrule == "beta") {
stop("Error: Unordered factor splitting not implemented for 'beta' splitting rule.")
} else if (splitrule == "poisson") {
stop("Error: Unordered factor splitting not implemented for 'poisson' splitting rule.")
}
}

@@ -851,9 +873,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
prediction.mode, loaded.forest, snp.data,
replace, probability, unordered.factor.variables, use.unordered.factor.variables,
save.memory, splitrule.num, case.weights, use.case.weights, class.weights,
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type,
num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth,
inbag, use.inbag,
predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau,
holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data,
order.snps, oob.error, max.depth, inbag, use.inbag,
regularization.factor, use.regularization.factor, regularization.usedepth)

if (length(result) == 0) {
8 changes: 7 additions & 1 deletion man/ranger.Rd

(Diff not rendered: generated documentation file.)

46 changes: 25 additions & 21 deletions src/Forest.cpp
@@ -29,12 +29,14 @@
namespace ranger {

Forest::Forest() :
verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), seed(0), num_samples(
0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting(
false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout(
false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(
DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), num_threads(DEFAULT_NUM_THREADS), data { }, overall_prediction_error(
NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), progress(0) {
verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0),
seed(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true),
memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false),
sample_fraction( { 1 }), holdout(false), prediction_type(DEFAULT_PREDICTIONTYPE),
num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA),
minprop(DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_threads(DEFAULT_NUM_THREADS), data { },
overall_prediction_error(NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false),
progress(0) {
}

// #nocov start
Expand All @@ -44,8 +46,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode
std::string split_select_weights_file, const std::vector<std::string>& always_split_variable_names,
std::string status_variable_name, bool sample_with_replacement,
const std::vector<std::string>& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule,
std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout,
PredictionType prediction_type, uint num_random_splits, uint max_depth,
std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop,
double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth,
const std::vector<double>& regularization_factor, bool regularization_usedepth) {

this->verbose_out = verbose_out;
@@ -81,8 +83,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode
// Call other init function
init(memory_mode, loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode,
min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting,
splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits,
false, max_depth, regularization_factor, regularization_usedepth);
splitrule, predict_all, sample_fraction_vector, alpha, minprop, poisson_tau, holdout, prediction_type,
num_random_splits, false, max_depth, regularization_factor, regularization_usedepth);

if (prediction_mode) {
loadFromFile(load_forest_filename);
@@ -139,17 +141,17 @@ void Forest::initR(std::unique_ptr<Data> input_data, uint mtry, uint num_trees,
bool prediction_mode, bool sample_with_replacement, const std::vector<std::string>& unordered_variable_names,
bool memory_saving_splitting, SplitRule splitrule, std::vector<double>& case_weights,
std::vector<std::vector<size_t>>& manual_inbag, bool predict_all, bool keep_inbag,
std::vector<double>& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type,
uint num_random_splits, bool order_snps, uint max_depth, const std::vector<double>& regularization_factor,
bool regularization_usedepth) {
std::vector<double>& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout,
PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth,
const std::vector<double>& regularization_factor, bool regularization_usedepth) {

this->verbose_out = verbose_out;

// Call other init function
init(MEM_DOUBLE, std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size,
prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule,
predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth,
regularization_factor, regularization_usedepth);
predict_all, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits,
order_snps, max_depth, regularization_factor, regularization_usedepth);

// Set variables to be always considered for splitting
if (!always_split_variable_names.empty()) {
@@ -182,8 +184,9 @@ void Forest::init(MemoryMode memory_mode, std::unique_ptr<Data> input_data, uint
uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size,
bool prediction_mode, bool sample_with_replacement, const std::vector<std::string>& unordered_variable_names,
bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector<double>& sample_fraction,
double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps,
uint max_depth, const std::vector<double>& regularization_factor, bool regularization_usedepth) {
double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type,
uint num_random_splits, bool order_snps, uint max_depth, const std::vector<double>& regularization_factor,
bool regularization_usedepth) {

// Initialize data with memmode
this->data = std::move(input_data);
@@ -224,6 +227,7 @@ this->holdout = holdout;
this->holdout = holdout;
this->alpha = alpha;
this->minprop = minprop;
this->poisson_tau = poisson_tau;
this->prediction_type = prediction_type;
this->num_random_splits = num_random_splits;
this->max_depth = max_depth;
@@ -476,10 +480,10 @@ void Forest::grow() {
tree_manual_inbag = &manual_inbag[0];
}

trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights,
importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights,
tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth,
&regularization_factor, regularization_usedepth, &split_varIDs_used);
trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs,
tree_split_select_weights, importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting,
splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, poisson_tau, holdout,
num_random_splits, max_depth, &regularization_factor, regularization_usedepth, &split_varIDs_used);
}

// Init variable importance