Add Poisson splitting rule #495

Merged 12 commits on Jun 11, 2024
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -17,6 +17,6 @@ LinkingTo: Rcpp, RcppEigen
Depends: R (>= 3.1)
Suggests: survival, testthat
Encoding: UTF-8
RoxygenNote: 7.0.2
RoxygenNote: 7.1.0
URL: https://github.com/imbs-hl/ranger
BugReports: https://github.com/imbs-hl/ranger/issues
4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -1,8 +1,8 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)
rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)
}

numSmaller <- function(values, reference) {
3 changes: 2 additions & 1 deletion R/predict.R
@@ -233,6 +233,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
splitrule <- 1
alpha <- 0
minprop <- 0
poisson.tau <- 1
case.weights <- c(0, 0)
use.case.weights <- FALSE
class.weights <- c(0, 0)
@@ -269,7 +270,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
prediction.mode, forest, snp.data, replace, probability,
unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule,
case.weights, use.case.weights, class.weights,
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout,
predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau, holdout,
prediction.type, num.random.splits, sparse.x, use.sparse.data,
order.snps, oob.error, max.depth, inbag, use.inbag,
regularization.factor, use.regularization.factor, regularization.usedepth)
30 changes: 26 additions & 4 deletions R/ranger.R
@@ -98,10 +98,14 @@
##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values.
##' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.
##' @param class.weights Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.
##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini". For regression "variance", "extratrees", "maxstat" or "beta" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".
##' @param splitrule Splitting rule. For classification and probability estimation "gini", "extratrees" or "hellinger" with default "gini".
##' For regression "variance", "extratrees", "maxstat", "beta" or "poisson" with default "variance".
##' For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank".
##' @param num.random.splits For "extratrees" splitrule: Number of random splits to consider for each candidate splitting variable.
##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting.
##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.
##' @param poisson.tau For "poisson" splitrule: The coefficient of variation of the (expected) frequency is \eqn{1/\tau}.
##' If a terminal node has only 0 responses, the estimate is set to \eqn{\alpha \cdot 0 + (1-\alpha) \cdot mean(parent)} with \eqn{\alpha = samples(child) \cdot mean(parent) / (\tau + samples(child) \cdot mean(parent))}.
##' @param split.select.weights Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.
##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting.
##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition" for all other splitrules 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details.
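
A minimal usage sketch of the interface this PR adds: fitting a regression forest on simulated count data with the new "poisson" splitting rule and its `poisson.tau` tuning parameter. The simulated data, variable names, and parameter values below are illustrative only and are not part of the diff; only `splitrule = "poisson"` and `poisson.tau` come from this PR (so a ranger build containing this change is assumed).

```r
## Hedged usage sketch (not part of the diff).
library(ranger)

set.seed(42)
n  <- 500
x1 <- runif(n)
x2 <- runif(n)
## Count outcome: non-negative with sum(y) > 0, as required by the new check below.
y  <- rpois(n, lambda = exp(1 + x1 - 0.5 * x2))
dat <- data.frame(y = y, x1 = x1, x2 = x2)

fit <- ranger(y ~ ., data = dat, num.trees = 100,
              splitrule   = "poisson",  # new regression splitting rule
              poisson.tau = 1)          # new tuning parameter (default 1)

head(predict(fit, dat)$predictions)
```
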
@@ -213,6 +217,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
sample.fraction = ifelse(replace, 1, 0.632),
case.weights = NULL, class.weights = NULL, splitrule = NULL,
num.random.splits = 1, alpha = 0.5, minprop = 0.1,
poisson.tau = 1,
split.select.weights = NULL, always.split.variables = NULL,
respect.unordered.factors = NULL,
scale.permutation.importance = FALSE,
@@ -729,6 +734,17 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
if ((is.factor(y) && nlevels(y) > 2) || (length(unique(y)) > 2)) {
stop("Error: Hellinger splitrule only implemented for binary classification.")
}
} else if (splitrule == "poisson") {
if (treetype == 3) {
splitrule.num <- 8
} else {
stop("Error: poisson splitrule applicable to regression data only.")
}

## Check for valid responses
if (min(y) < 0 || sum(y) <= 0) {
stop("Error: poisson splitrule applicable to regression data with non-positive outcome (y>=0 and sum(y)>0) only.")
}
} else {
stop("Error: Unknown splitrule.")
}
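
To make this hunk and the `poisson.tau` documentation above concrete, here is a hedged standalone R sketch of the response check (non-negative outcome with at least one positive value) and the shrinkage estimate described for child nodes containing only zero responses. It illustrates the documented formula; it is not the C++ splitting code this PR adds, and the plain node mean used for the non-zero case is an assumption (the usual Poisson maximum-likelihood estimate), not something spelled out in the diff.

```r
## Standalone illustration, not the C++ implementation.

## Response check mirroring the validation above: y >= 0 and sum(y) > 0.
poisson_response_ok <- function(y) {
  is.numeric(y) && min(y) >= 0 && sum(y) > 0
}

## Estimate for a child node, following the poisson.tau documentation:
## for a zero-only child, alpha * 0 + (1 - alpha) * mean(parent) with
## alpha = n_child * mean(parent) / (tau + n_child * mean(parent)).
## The non-zero branch (plain node mean) is an assumption for illustration.
child_node_estimate <- function(y_child, mean_parent, tau = 1) {
  if (sum(y_child) > 0) {
    return(mean(y_child))
  }
  n_child <- length(y_child)
  alpha <- n_child * mean_parent / (tau + n_child * mean_parent)
  alpha * 0 + (1 - alpha) * mean_parent
}

poisson_response_ok(c(0, 1, 2, 0))                  # TRUE
poisson_response_ok(c(-1, 2, 3))                    # FALSE: negative response
poisson_response_ok(c(0, 0, 0))                     # FALSE: sum(y) == 0
child_node_estimate(c(0, 0, 0), mean_parent = 0.8)  # ~0.235, between 0 and the parent mean
```

Per this formula, a larger `poisson.tau` moves the zero-only child estimate closer to the parent mean, while a smaller value lets it drop toward zero, which is why the argument check later in this file requires `poisson.tau` to be a positive number.
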
@@ -754,6 +770,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
if (num.random.splits > 1 && splitrule.num != 5) {
warning("Argument 'num.random.splits' ignored if splitrule is not 'extratrees'.")
}

if (!is.numeric(poisson.tau) || poisson.tau <= 0) {
stop("Error: Invalid value for poisson.tau, please give a positive number.")
}

## Unordered factors
if (respect.unordered.factors == "partition") {
@@ -790,6 +810,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
stop("Error: Unordered factor splitting not implemented for 'C' splitting rule.")
} else if (splitrule == "beta") {
stop("Error: Unordered factor splitting not implemented for 'beta' splitting rule.")
} else if (splitrule == "poisson") {
stop("Error: Unordered factor splitting not implemented for 'poisson' splitting rule.")
}
}

@@ -851,9 +873,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
prediction.mode, loaded.forest, snp.data,
replace, probability, unordered.factor.variables, use.unordered.factor.variables,
save.memory, splitrule.num, case.weights, use.case.weights, class.weights,
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type,
num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth,
inbag, use.inbag,
predict.all, keep.inbag, sample.fraction, alpha, minprop, poisson.tau,
holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data,
order.snps, oob.error, max.depth, inbag, use.inbag,
regularization.factor, use.regularization.factor, regularization.usedepth)

if (length(result) == 0) {
8 changes: 7 additions & 1 deletion man/ranger.Rd

(Diff not rendered: generated documentation file.)

46 changes: 25 additions & 21 deletions src/Forest.cpp
@@ -29,12 +29,14 @@
namespace ranger {

Forest::Forest() :
verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), seed(0), num_samples(
0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting(
false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout(
false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(
DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), num_threads(DEFAULT_NUM_THREADS), data { }, overall_prediction_error(
NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false), progress(0) {
verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0),
seed(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true),
memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false),
sample_fraction( { 1 }), holdout(false), prediction_type(DEFAULT_PREDICTIONTYPE),
num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA),
minprop(DEFAULT_MINPROP), poisson_tau(DEFAULT_POISSON_TAU), num_threads(DEFAULT_NUM_THREADS), data { },
overall_prediction_error(NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), regularization_usedepth(false),
progress(0) {
}

// #nocov start
Expand All @@ -44,8 +46,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode
std::string split_select_weights_file, const std::vector<std::string>& always_split_variable_names,
std::string status_variable_name, bool sample_with_replacement,
const std::vector<std::string>& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule,
std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout,
PredictionType prediction_type, uint num_random_splits, uint max_depth,
std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop,
double poisson_tau, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth,
const std::vector<double>& regularization_factor, bool regularization_usedepth) {

this->verbose_out = verbose_out;
@@ -81,8 +83,8 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode
// Call other init function
init(memory_mode, loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode,
min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting,
splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits,
false, max_depth, regularization_factor, regularization_usedepth);
splitrule, predict_all, sample_fraction_vector, alpha, minprop, poisson_tau, holdout, prediction_type,
num_random_splits, false, max_depth, regularization_factor, regularization_usedepth);

if (prediction_mode) {
loadFromFile(load_forest_filename);
@@ -139,17 +141,17 @@ void Forest::initR(std::unique_ptr<Data> input_data, uint mtry, uint num_trees,
bool prediction_mode, bool sample_with_replacement, const std::vector<std::string>& unordered_variable_names,
bool memory_saving_splitting, SplitRule splitrule, std::vector<double>& case_weights,
std::vector<std::vector<size_t>>& manual_inbag, bool predict_all, bool keep_inbag,
std::vector<double>& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type,
uint num_random_splits, bool order_snps, uint max_depth, const std::vector<double>& regularization_factor,
bool regularization_usedepth) {
std::vector<double>& sample_fraction, double alpha, double minprop, double poisson_tau, bool holdout,
PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth,
const std::vector<double>& regularization_factor, bool regularization_usedepth) {

this->verbose_out = verbose_out;

// Call other init function
init(MEM_DOUBLE, std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size,
prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule,
predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth,
regularization_factor, regularization_usedepth);
predict_all, sample_fraction, alpha, minprop, poisson_tau, holdout, prediction_type, num_random_splits,
order_snps, max_depth, regularization_factor, regularization_usedepth);

// Set variables to be always considered for splitting
if (!always_split_variable_names.empty()) {
@@ -182,8 +184,9 @@ void Forest::init(MemoryMode memory_mode, std::unique_ptr<Data> input_data, uint
uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size,
bool prediction_mode, bool sample_with_replacement, const std::vector<std::string>& unordered_variable_names,
bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector<double>& sample_fraction,
double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps,
uint max_depth, const std::vector<double>& regularization_factor, bool regularization_usedepth) {
double alpha, double minprop, double poisson_tau, bool holdout, PredictionType prediction_type,
uint num_random_splits, bool order_snps, uint max_depth, const std::vector<double>& regularization_factor,
bool regularization_usedepth) {

// Initialize data with memmode
this->data = std::move(input_data);
@@ -224,6 +227,7 @@ this->holdout = holdout;
this->holdout = holdout;
this->alpha = alpha;
this->minprop = minprop;
this->poisson_tau = poisson_tau;
this->prediction_type = prediction_type;
this->num_random_splits = num_random_splits;
this->max_depth = max_depth;
@@ -476,10 +480,10 @@ void Forest::grow() {
tree_manual_inbag = &manual_inbag[0];
}

trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights,
importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights,
tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth,
&regularization_factor, regularization_usedepth, &split_varIDs_used);
trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs,
tree_split_select_weights, importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting,
splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, poisson_tau, holdout,
num_random_splits, max_depth, &regularization_factor, regularization_usedepth, &split_varIDs_used);
}

// Init variable importance