From ee05131b62379a9d3622d73b894a26ed27f0cd4f Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 24 Jul 2020 12:15:39 +0200 Subject: [PATCH 001/111] fix probability class labels in treeInfo(), #536 --- DESCRIPTION | 6 +++--- NEWS | 3 +++ NEWS.md | 3 +++ R/treeInfo.R | 4 +++- cpp_version/src/version.h | 2 +- tests/testthat/test_treeInfo.R | 17 +++++++++++++++++ 6 files changed, 30 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 17827435f..1bd39cbd6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.12.3 -Date: 2020-05-08 +Version: 0.12.4 +Date: 2020-07-24 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -17,6 +17,6 @@ LinkingTo: Rcpp, RcppEigen Depends: R (>= 3.1) Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/NEWS b/NEWS index ae052ebef..0a1acc36f 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,7 @@ +##### Version 0.12.4 +* Bug fixes + ##### Version 0.12.3 * Add ... argument to ranger() * Bug fixes diff --git a/NEWS.md b/NEWS.md index 7714caa31..2cc95defc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +##### Version 0.12.4 +* Bug fixes + ##### Version 0.12.3 * Add ... 
argument to ranger() * Bug fixes diff --git a/R/treeInfo.R b/R/treeInfo.R index 885a87194..a27682608 100644 --- a/R/treeInfo.R +++ b/R/treeInfo.R @@ -128,7 +128,9 @@ treeInfo <- function(object, tree = 1) { } else if (forest$treetype == "Probability estimation") { predictions <- matrix(nrow = nrow(result), ncol = length(forest$levels)) predictions[result$terminal, ] <- do.call(rbind, forest$terminal.class.counts[[tree]]) - colnames(predictions) <- paste0("pred.", forest$levels) + colnames(predictions) <- forest$levels[forest$class.values] + predictions <- predictions[, forest$levels, drop = FALSE] + colnames(predictions) <- paste0("pred.", colnames(predictions)) result <- data.frame(result, predictions) } else if (forest$treetype == "Survival") { # No prediction for survival (CHF too large?) diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index f256e6229..f5f4f23af 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.12.3" +#define RANGER_VERSION "0.12.4" #endif diff --git a/tests/testthat/test_treeInfo.R b/tests/testthat/test_treeInfo.R index 8af41b570..02b7d0a74 100644 --- a/tests/testthat/test_treeInfo.R +++ b/tests/testthat/test_treeInfo.R @@ -164,6 +164,23 @@ test_that("Prediction for probability is one probability per class, sum to 1", { expect_true(all(!ti.prob.formula$terminal | rowSums(ti.prob.formula[, 8:10]) == 1)) }) +test_that("Prediction for probability has correct factor levels", { + dat <- iris[c(101:150, 1:100), ] + rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 5, probability = TRUE) + + # Predict + pred_rf <- predict(rf, dat, num.trees = 1)$predictions + + # Predict with treeInfo + ti <- treeInfo(rf) + terminal_nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + pred_ti <- as.matrix(ti[terminal_nodes + 1, grep("pred", colnames(ti))]) + colnames(pred_ti) <- gsub("pred\\.", "", colnames(pred_ti)) + 
rownames(pred_ti) <- NULL + + expect_equal(pred_rf, pred_ti) +}) + ## Survival rf.surv.formula <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) rf.surv.first <- ranger(dependent.variable.name = "time", status.variable.name = "status", data = veteran[, c(3:4, 1:2, 5:8)], num.trees = 5) From 43338070cfeda8e93deb9206585fd935f74e4c01 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 28 Jul 2020 14:17:59 +0200 Subject: [PATCH 002/111] Don't load dependent variable name a second time, fix #538 --- src/Forest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Forest.cpp b/src/Forest.cpp index 663dc2d4e..21ee35790 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -905,6 +905,7 @@ void Forest::loadDependentVariableNamesFromFile(std::string filename) { } // Read dependent variable names + dependent_variable_names.clear(); uint num_dependent_variables = 0; infile.read((char*) &num_dependent_variables, sizeof(num_dependent_variables)); for (size_t i = 0; i < num_dependent_variables; ++i) { From 2d8b00875c4977a3829379add31381169afcad6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauren=C8=9Biu=20Nicola?= Date: Sat, 13 Mar 2021 09:52:03 +0200 Subject: [PATCH 003/111] Fix memory mode initialization --- src/Forest.cpp | 9 +++++---- src/Forest.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Forest.cpp b/src/Forest.cpp index 21ee35790..d4f59a1d2 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -48,6 +48,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { + this->memory_mode = memory_mode; this->verbose_out = verbose_out; if (!dependent_variable_name.empty()) { @@ -79,7 +80,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode } // Call other init function - init(memory_mode, loadDataFromFile(input_file), mtry, 
output_prefix, num_trees, seed, num_threads, importance_mode, + init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, false, max_depth, regularization_factor, regularization_usedepth); @@ -143,10 +144,11 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { + this->memory_mode = memory_mode; this->verbose_out = verbose_out; // Call other init function - init(MEM_DOUBLE, std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, + init(std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth); @@ -178,7 +180,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, this->keep_inbag = keep_inbag; } -void Forest::init(MemoryMode memory_mode, std::unique_ptr input_data, uint mtry, std::string output_prefix, +void Forest::init(std::unique_ptr input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, @@ -214,7 +216,6 @@ void Forest::init(MemoryMode memory_mode, std::unique_ptr input_data, uint this->output_prefix = 
output_prefix; this->importance_mode = importance_mode; this->min_node_size = min_node_size; - this->memory_mode = memory_mode; this->prediction_mode = prediction_mode; this->sample_with_replacement = sample_with_replacement; this->memory_saving_splitting = memory_saving_splitting; diff --git a/src/Forest.h b/src/Forest.h index 485f74452..a1275b43e 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -58,7 +58,7 @@ class Forest { bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); - void init(MemoryMode memory_mode, std::unique_ptr input_data, uint mtry, std::string output_prefix, + void init(std::unique_ptr input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, From 6b8574e09b4e73473af5425b5a3d73d88cd976e8 Mon Sep 17 00:00:00 2001 From: Brandon Greenwell Date: Tue, 23 Mar 2021 17:10:54 -0400 Subject: [PATCH 004/111] Add `verbose` option to `csrf()` `csrf()` can be computationally demanding, and having an idea of the progress would be really beneficial. While there are many ways to accomplish this (e.g., progress bars, etc.), this approach does not require any changes to the core `csrf()` code or add any dependencies. The default here is `FALSE`. --- R/csrf.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/R/csrf.R b/R/csrf.R index b2bea173b..758dbe6dc 100644 --- a/R/csrf.R +++ b/R/csrf.R @@ -45,6 +45,7 @@ ##' @param test_data Test data of class \code{data.frame}. ##' @param params1 Parameters for the proximity random forest grown in the first step. 
##' @param params2 Parameters for the prediction random forests grown in the second step. +##' @param verbose Logical indicating whether or not to print computation progress. ##' ##' @return Predictions for the test dataset. ##' @@ -63,7 +64,7 @@ ##' @references ##' Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \url{https://doi.org/10.1080/10618600.2014.983641}. ##' @export -csrf <- function(formula, training_data, test_data, params1 = list(), params2 = list()) { +csrf <- function(formula, training_data, test_data, params1 = list(), params2 = list(), verbose = FALSE) { ## Grow a random forest on the training data to obtain weights rf.proximity <- do.call(ranger, c(list(formula = formula, data = training_data, write.forest = TRUE), params1)) @@ -74,6 +75,13 @@ csrf <- function(formula, training_data, test_data, params1 = list(), params2 = ## Grow weighted RFs for test observations, predict the outcome predictions <- sapply(1:nrow(test_data), function(i) { + ## Print computation progress + if (isTRUE(verbose)) { + message("Computing case-specific prediction for test observation ", + i, " of ", nrow(test_data), ". 
(", round(i / nrow(test_data) * 100, digits = 2), + "% complete.)") + } + ## Compute weights from first RF num.same.node <- rowSums(terminal.nodeIDs.test[i, ] == terminal.nodeIDs.train) weights <- num.same.node / sum(num.same.node) From e7a72bbbf98bd24f9bca44fc864cea132eeeb72e Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 26 Mar 2021 07:54:34 +0100 Subject: [PATCH 005/111] doc changes --- man/csrf.Rd | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/man/csrf.Rd b/man/csrf.Rd index dd3a9f917..1a7617388 100644 --- a/man/csrf.Rd +++ b/man/csrf.Rd @@ -4,7 +4,14 @@ \alias{csrf} \title{Case-specific random forests.} \usage{ -csrf(formula, training_data, test_data, params1 = list(), params2 = list()) +csrf( + formula, + training_data, + test_data, + params1 = list(), + params2 = list(), + verbose = FALSE +) } \arguments{ \item{formula}{Object of class \code{formula} or \code{character} describing the model to fit.} @@ -16,6 +23,8 @@ csrf(formula, training_data, test_data, params1 = list(), params2 = list()) \item{params1}{Parameters for the proximity random forest grown in the first step.} \item{params2}{Parameters for the prediction random forests grown in the second step.} + +\item{verbose}{Logical indicating whether or not to print computation progress.} } \value{ Predictions for the test dataset. 
From 0eedd9c97f2a6af2062c82b617a4acd327a33220 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 07:04:44 +0200 Subject: [PATCH 006/111] add GH actions standard workflow --- .Rbuildignore | 1 + .github/.gitignore | 1 + .github/workflows/R-CMD-check.yaml | 86 ++++++++++++++++++++++++++++++ README.md | 1 + 4 files changed, 89 insertions(+) create mode 100644 .github/.gitignore create mode 100644 .github/workflows/R-CMD-check.yaml diff --git a/.Rbuildignore b/.Rbuildignore index b407c44c7..8e13d3f1c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^\.travis.yml$ ^cpp\_version$ ^appveyor\.yml$ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 000000000..2d19fc766 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 000000000..fb7b3702a --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,86 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 
+# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + if: runner.os != 'Windows' + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = 
TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.config.r }}-results + path: check diff --git a/README.md b/README.md index bb0a9af7e..fddacd60e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +[![R-CMD-check](https://github.com/imbs-hl/ranger/workflows/R-CMD-check/badge.svg)](https://github.com/imbs-hl/ranger/actions) [![Travis Build Status](https://travis-ci.org/imbs-hl/ranger.svg?branch=master)](https://travis-ci.org/imbs-hl/ranger) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/imbs-hl/ranger?branch=master&svg=true)](https://ci.appveyor.com/project/mnwright/ranger) [![Coverage Status](https://coveralls.io/repos/github/imbs-hl/ranger/badge.svg?branch=master)](https://coveralls.io/github/imbs-hl/ranger?branch=master) From 8e293ab5802649a54f23cc6e31ddc4f22ce32d4f Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 09:04:45 +0200 Subject: [PATCH 007/111] remove self assignment --- src/Forest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Forest.cpp b/src/Forest.cpp index d4f59a1d2..f914d44ee 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -144,7 +144,6 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { - this->memory_mode = memory_mode; this->verbose_out = verbose_out; // Call other init function From 4677e69a05cfddfd0631573ba9fee03694250d0b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 09:08:22 +0200 Subject: [PATCH 008/111] add oldrel check --- .github/workflows/R-CMD-check.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 
fb7b3702a..8bcc15c48 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -24,6 +24,7 @@ jobs: config: - {os: windows-latest, r: 'release'} - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} From 81de34a0dc110948cd7a536fe686a7157f5da83b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 09:20:52 +0200 Subject: [PATCH 009/111] add codecov coverage workflow --- .github/workflows/test-coverage.yaml | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 .github/workflows/test-coverage.yaml diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 000000000..ba1f94fde --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,48 @@ +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +name: test-coverage + +jobs: + test-coverage: + runs-on: macOS-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: 
Install dependencies + run: | + install.packages(c("remotes")) + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} From 37dd8355eefda169d3a9a5e8b8a6d0f568e9eaa6 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 09:25:29 +0200 Subject: [PATCH 010/111] remove old CI and switch to codecov --- .Rbuildignore | 3 +-- .travis.yml | 20 -------------------- DESCRIPTION | 5 ++++- README.md | 4 +--- appveyor.yml | 42 ------------------------------------------ codecov.yml | 14 ++++++++++++++ 6 files changed, 20 insertions(+), 68 deletions(-) delete mode 100644 .travis.yml delete mode 100644 appveyor.yml create mode 100644 codecov.yml diff --git a/.Rbuildignore b/.Rbuildignore index 8e13d3f1c..0c7757356 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,7 +4,6 @@ ^NEWS\.md$ ^README\.md$ ^.*\.tar.gz$ -^\.travis.yml$ ^cpp\_version$ -^appveyor\.yml$ ^\.github$ +^codecov\.yml$ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index f607c6345..000000000 --- a/.travis.yml +++ /dev/null @@ -1,20 +0,0 @@ -language: r -cache: packages -warnings_are_errors: true -r: - - oldrel - - release - - devel - -r_github_packages: - - r-lib/covr - -after_success: - - Rscript -e 'covr::coveralls()' - -notifications: - email: - recipients: - secure: 
qh6JWWLG+P2D9MstM/7zk9C1eZEehdgMA3qIRMfa+gW02YPMXJK7+I1CTPPvVzVQP7iQJKavC+LWoYKEPWdCyu0PzbzDI50PBArPbsOHNF0ZKGA+z0bFsRlCo/BzStdSO/bg3m60+zo6BCPSpPWqGWsnED4Wb2YGf20M7TLUeKZ2htmcIgQx5VmfOPTZG0lh8u3/c4SFM9jF7jxjuV3QI8C1gnvFfidic24OJH91NvIajIttWsFU7bSRLM0eKbutWFnuqPRDjVIw62zQsigXJjy3wo6Yo2CY41X95wAu88AOgO14i4GyRM6Hv7juRLJJfuts8OFUMtG8btzemx2fBx81YUeG1QRD1Puxax61elDDJtNALEghzAscAnguUoTUCdUpw9ras1nIf8R+HDErSZd9ZEJ0QpGBGJBbl5pe/4V2XWo9EPhvQe8pAO+3iXIsFnEcItxBBwe0tJqCnW+pdTd3N99szdCKP6HyuV+8SJqy7tgilBFJZpC3SGQ1ynP4FRSF59ekLUtAGlm1RYGnUTMtVAWHZVQOMmLKbZfPW3JqUwVuaMCzG7XOFEgCtpT1VmIcggvyeGMMCdXPlW4Ns42tPuSEEcxo7kmVo7v5SDIWdGu6Znqbrs9UG17GqL7oLehQsGnrbGMIuLu3CNW+5/6m7YrO45lsi4nqHaqEW8Y= - on_success: change - on_failure: change diff --git a/DESCRIPTION b/DESCRIPTION index 1bd39cbd6..31b1b32f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,10 @@ License: GPL-3 Imports: Rcpp (>= 0.11.2), Matrix LinkingTo: Rcpp, RcppEigen Depends: R (>= 3.1) -Suggests: survival, testthat +Suggests: + covr, + survival, + testthat Encoding: UTF-8 RoxygenNote: 7.1.0 URL: https://github.com/imbs-hl/ranger diff --git a/README.md b/README.md index fddacd60e..eb1f00954 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ [![R-CMD-check](https://github.com/imbs-hl/ranger/workflows/R-CMD-check/badge.svg)](https://github.com/imbs-hl/ranger/actions) -[![Travis Build Status](https://travis-ci.org/imbs-hl/ranger.svg?branch=master)](https://travis-ci.org/imbs-hl/ranger) -[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/imbs-hl/ranger?branch=master&svg=true)](https://ci.appveyor.com/project/mnwright/ranger) -[![Coverage Status](https://coveralls.io/repos/github/imbs-hl/ranger/badge.svg?branch=master)](https://coveralls.io/github/imbs-hl/ranger?branch=master) +[![Codecov test coverage](https://codecov.io/gh/imbs-hl/ranger/branch/master/graph/badge.svg)](https://codecov.io/gh/imbs-hl/ranger?branch=master) ![CRAN Downloads 
month](http://cranlogs.r-pkg.org/badges/ranger?color=brightgreen) ![CRAN Downloads overall](http://cranlogs.r-pkg.org/badges/grand-total/ranger?color=brightgreen) ## ranger: A Fast Implementation of Random Forests diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index e32d316cc..000000000 --- a/appveyor.yml +++ /dev/null @@ -1,42 +0,0 @@ -# DO NOT CHANGE the "init" and "install" sections below - -# Download script file from GitHub -init: - ps: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" - Import-Module '..\appveyor-tool.ps1' - -install: - ps: Bootstrap - -# Adapt as necessary starting from here - -build_script: - - travis-tool.sh install_deps - -test_script: - - travis-tool.sh run_tests - -on_failure: - - 7z a failure.zip *.Rcheck\* - - appveyor PushArtifact failure.zip - -artifacts: - - path: '*.Rcheck\**\*.log' - name: Logs - - - path: '*.Rcheck\**\*.out' - name: Logs - - - path: '*.Rcheck\**\*.fail' - name: Logs - - - path: '*.Rcheck\**\*.Rout' - name: Logs - - - path: '\*_*.tar.gz' - name: Bits - - - path: '\*_*.zip' - name: Bits diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 000000000..04c558599 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,14 @@ +comment: false + +coverage: + status: + project: + default: + target: auto + threshold: 1% + informational: true + patch: + default: + target: auto + threshold: 1% + informational: true From 8bd52a5a6e8e75f11d8299e459136e14e52137ff Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 09:35:45 +0200 Subject: [PATCH 011/111] update date --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 31b1b32f1..ff46f4958 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests Version: 0.12.4 -Date: 2020-07-24 +Date: 2021-05-03 
Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high From e509518666666b108c371e72cd979a812ac59717 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 May 2021 09:52:02 +0200 Subject: [PATCH 012/111] remove codecov again --- .Rbuildignore | 1 - .github/workflows/test-coverage.yaml | 48 ---------------------------- README.md | 1 - codecov.yml | 14 -------- 4 files changed, 64 deletions(-) delete mode 100644 .github/workflows/test-coverage.yaml delete mode 100644 codecov.yml diff --git a/.Rbuildignore b/.Rbuildignore index 0c7757356..c8160463c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,4 +6,3 @@ ^.*\.tar.gz$ ^cpp\_version$ ^\.github$ -^codecov\.yml$ diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml deleted file mode 100644 index ba1f94fde..000000000 --- a/.github/workflows/test-coverage.yaml +++ /dev/null @@ -1,48 +0,0 @@ -on: - push: - branches: - - main - - master - pull_request: - branches: - - main - - master - -name: test-coverage - -jobs: - test-coverage: - runs-on: macOS-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v2 - - - uses: r-lib/actions/setup-r@v1 - - - uses: r-lib/actions/setup-pandoc@v1 - - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Restore R package cache - uses: actions/cache@v2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - - - name: Install dependencies - run: | - install.packages(c("remotes")) - 
remotes::install_deps(dependencies = TRUE) - remotes::install_cran("covr") - shell: Rscript {0} - - - name: Test coverage - run: covr::codecov() - shell: Rscript {0} diff --git a/README.md b/README.md index eb1f00954..d31e62821 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ [![R-CMD-check](https://github.com/imbs-hl/ranger/workflows/R-CMD-check/badge.svg)](https://github.com/imbs-hl/ranger/actions) -[![Codecov test coverage](https://codecov.io/gh/imbs-hl/ranger/branch/master/graph/badge.svg)](https://codecov.io/gh/imbs-hl/ranger?branch=master) ![CRAN Downloads month](http://cranlogs.r-pkg.org/badges/ranger?color=brightgreen) ![CRAN Downloads overall](http://cranlogs.r-pkg.org/badges/grand-total/ranger?color=brightgreen) ## ranger: A Fast Implementation of Random Forests diff --git a/codecov.yml b/codecov.yml deleted file mode 100644 index 04c558599..000000000 --- a/codecov.yml +++ /dev/null @@ -1,14 +0,0 @@ -comment: false - -coverage: - status: - project: - default: - target: auto - threshold: 1% - informational: true - patch: - default: - target: auto - threshold: 1% - informational: true From 0af4e220dc21a5406840002185002decfca9d696 Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Wed, 12 May 2021 13:06:21 +0200 Subject: [PATCH 013/111] Add basic pkgdown infrastructure --- .Rbuildignore | 3 +++ .github/workflows/pkgdown.yaml | 48 ++++++++++++++++++++++++++++++++++ .gitignore | 3 ++- _pkgdown.yml | 0 4 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pkgdown.yaml create mode 100644 _pkgdown.yml diff --git a/.Rbuildignore b/.Rbuildignore index c8160463c..35db0f295 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,6 @@ ^.*\.tar.gz$ ^cpp\_version$ ^\.github$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 000000000..1abece4b9 --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,48 @@ +on: + push: + branches: + - main 
+ - master + +name: pkgdown + +jobs: + pkgdown: + runs-on: macOS-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + install.packages("pkgdown", type = "binary") + shell: Rscript {0} + + - name: Install package + run: R CMD INSTALL . 
+ + - name: Deploy package + run: | + git config --local user.email "actions@github.com" + git config --local user.name "GitHub Actions" + Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' diff --git a/.gitignore b/.gitignore index 24924d6cc..96f14cc59 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ ranger.Rcheck/ *.Rproj .project -.cproject \ No newline at end of file +.cproject +docs diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 000000000..e69de29bb From 0038c7de35be8b6733fc4ea667bfd02df99e8c14 Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Wed, 12 May 2021 13:17:14 +0200 Subject: [PATCH 014/111] Reformat NEWS.md for pkgdown --- NEWS.md | 185 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 92 insertions(+), 93 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2cc95defc..09af082d9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,251 +1,250 @@ - -##### Version 0.12.4 +# ranger 0.12.4 * Bug fixes -##### Version 0.12.3 +# ranger 0.12.3 * Add ... 
argument to ranger() * Bug fixes -##### Version 0.12.2 +# ranger 0.12.2 * Bug fixes -##### Version 0.12.1 +# ranger 0.12.1 * Bug fixes -##### Version 0.12.0 +# ranger 0.12.0 * New CRAN version -##### Version 0.11.8 +# ranger 0.11.8 * Add regularization * Faster computation (in some cases) -##### Version 0.11.7 +# ranger 0.11.7 * Add local variable importance -##### Version 0.11.6 +# ranger 0.11.6 * Add "hellinger" splitrule for binary classification -##### Version 0.11.5 +# ranger 0.11.5 * Add x/y interface * Internal changes (seed differences possible, prediction incompatible with older versions) -##### Version 0.11.4 +# ranger 0.11.4 * Add "beta" splitrule for bounded outcomes -##### Version 0.11.3 +# ranger 0.11.3 * Accept user-specified function in quantile prediction -##### Version 0.11.2 +# ranger 0.11.2 * Bug fixes -##### Version 0.11.1 +# ranger 0.11.1 * Bug fixes -##### Version 0.11.0 +# ranger 0.11.0 * New CRAN version -##### Version 0.10.6 +# ranger 0.10.6 * Internal changes (slightly improved computation speed) * Warning: Possible seed differences compared to older versions * Bug fixes -##### Version 0.10.5 +# ranger 0.10.5 * Add support of splitting weights for corrected impurity importance * Bug fixes -##### Version 0.10.4 +# ranger 0.10.4 * Add inbag argument for manual selection of observations in trees -##### Version 0.10.3 +# ranger 0.10.3 * Bug fixes -##### Version 0.10.2 +# ranger 0.10.2 * Add max.depth parameter to limit tree depth -##### Version 0.10.1 +# ranger 0.10.1 * Bug fixes -##### Version 0.10.0 +# ranger 0.10.0 * New CRAN version -##### Version 0.9.12 +# ranger 0.9.12 * Remove GenABEL from suggested packages (removed from CRAN). 
GenABEL data is still supported -##### Version 0.9.11 +# ranger 0.9.11 * Improve memory management (internal changes) -##### Version 0.9.10 +# ranger 0.9.10 * Add impurity importance for the maxstat splitting rule * Bug fixes -##### Version 0.9.9 +# ranger 0.9.9 * Add 'order' mode for unordered factors for GenABEL SNP data (binary classification and regression) -##### Version 0.9.8 +# ranger 0.9.8 * Bug fixes -##### Version 0.9.7 +# ranger 0.9.7 * Change license of C++ core to MIT (R package is still GPL3) -##### Version 0.9.6 +# ranger 0.9.6 * Better 'order' mode for unordered factors for multiclass and survival -##### Version 0.9.5 +# ranger 0.9.5 * Bug fixes -##### Version 0.9.4 +# ranger 0.9.4 * Add class-weighted Gini splitting -##### Version 0.9.3 +# ranger 0.9.3 * Bug fixes -##### Version 0.9.2 +# ranger 0.9.2 * Add fixed proportion sampling -##### Version 0.9.1 +# ranger 0.9.1 * Bug fixes -##### Version 0.9.0 +# ranger 0.9.0 * New CRAN version -##### Version 0.8.5 +# ranger 0.8.5 * Faster aggregation of predictions * Fix memory issues on Windows 7 * Add treeInfo() function to extract human readable tree structure -##### Version 0.8.4 +# ranger 0.8.4 * Add quantile prediction as in quantile regression forests -##### Version 0.8.3 +# ranger 0.8.3 * Add standard error estimation with the infinitesimal jackknife (now the default) -##### Version 0.8.2 +# ranger 0.8.2 * Add bias-corrected impurity importance (actual impurity reduction, AIR) * Add impurity importance for survival forests -##### Version 0.8.1 +# ranger 0.8.1 * Bug fixes -##### Version 0.8.0 +# ranger 0.8.0 * New CRAN version -##### Version 0.7.2 +# ranger 0.7.2 * Handle sparse data of class Matrix::dgCMatrix * Add prediction of standard errors to predict() -##### Version 0.7.1 +# ranger 0.7.1 * Allow devtools::install_github() without subdir and on Windows * Bug fixes -##### Version 0.7.0 +# ranger 0.7.0 * New CRAN version -##### Version 0.6.7 +# ranger 0.6.7 * Improvements in holdoutRF and 
importance p-value estimation -##### Version 0.6.6 +# ranger 0.6.6 * Split at mid-point between candidate values -##### Version 0.6.5 +# ranger 0.6.5 * Better formula interface: Support interactions terms and faster computation -##### Version 0.6.4 +# ranger 0.6.4 * Add randomized splitting (extraTrees) -##### Version 0.6.3 +# ranger 0.6.3 * Bug fixes -##### Version 0.6.2 +# ranger 0.6.2 * Drop unused factor levels in outcome before growing * Add predict.all for probability and survival prediction -##### Version 0.6.1 +# ranger 0.6.1 * Bug fixes -##### Version 0.6.0 +# ranger 0.6.0 * New CRAN version -##### Version 0.5.6 +# ranger 0.5.6 * Faster version of getTerminalNodeIDs(), included in predict() -##### Version 0.5.5 +# ranger 0.5.5 * Handle new factor levels in 'order' mode * Bug fixes -##### Version 0.5.4 +# ranger 0.5.4 * Set write.forest=TRUE by default * Add num.trees option to predict() * Bug fixes -##### Version 0.5.3 +# ranger 0.5.3 * Bug fixes -##### Version 0.5.2 +# ranger 0.5.2 * Use unadjusted p-value for 2 categories in maxstat splitting -##### Version 0.5.1 +# ranger 0.5.1 * Bug fixes -##### Version 0.5.0 +# ranger 0.5.0 * New CRAN version -##### Version 0.4.7 +# ranger 0.4.7 * Add splitting by maximally selected rank statistics for regression forests -##### Version 0.4.6 +# ranger 0.4.6 * Bug fixes -##### Version 0.4.5 +# ranger 0.4.5 * Use faster method for unordered factor splitting -##### Version 0.4.4 +# ranger 0.4.4 * Add p-values for variable importance * Bug fixes -##### Version 0.4.3 +# ranger 0.4.3 * Add splitting by maximally selected rank statistics for survival forests * Bug fixes -##### Version 0.4.2 +# ranger 0.4.2 * Add Windows multithreading support for new toolchain -##### Version 0.4.1 +# ranger 0.4.1 * Runtime improvement for regression forests on classification data -##### Version 0.4.0 +# ranger 0.4.0 * New CRAN version. 
New CRAN versions will be 0.x.0, development versions 0.x.y -##### Version 0.3.9 +# ranger 0.3.9 * Reduce memory usage of savest forest objects (changed child.nodeIDs interface) -##### Version 0.3.8 +# ranger 0.3.8 * Remove tuning functions, please use mlr or caret -##### Version 0.3.7 +# ranger 0.3.7 * Fix bug with alternative interface and prediction * Small fixes -##### Version 0.3.6 +# ranger 0.3.6 * Add keep.inbag option to track in-bag counts * Add option sample.fraction for fraction of sampled observations -##### Version 0.3.5 +# ranger 0.3.5 * Add tree-wise split.select.weights -##### Version 0.3.4 +# ranger 0.3.4 * Add predict.all option in predict() to get individual predictions for each tree for classification and regression * Small changes in documentation -##### Version 0.3.3 +# ranger 0.3.3 * Add case-specific random forests -##### Version 0.3.2 +# ranger 0.3.2 * Add case weights (weighted bootstrapping or subsampling) -##### Version 0.3.1 +# ranger 0.3.1 * Catch error of outdated gcc not supporting C++11 completely -##### Version 0.3.0 +# ranger 0.3.0 * Allow the user to interrupt computation from R * Transpose classification.table and rename to confusion.matrix * Respect R seed for prediction @@ -253,61 +252,61 @@ * Fix bug: Probability prediction for single observations * Fix bug: Results not identical when using alternative interface -##### Version 0.2.7 +# ranger 0.2.7 * Small fixes for Solaris compiler -##### Version 0.2.6 +# ranger 0.2.6 * Add C-index splitting * Fix NA SNP handling -##### Version 0.2.5 +# ranger 0.2.5 * Fix matrix and gwaa alternative survival interface * Version submitted to JSS -##### Version 0.2.4 +# ranger 0.2.4 * Small changes in documentation -##### Version 0.2.3 +# ranger 0.2.3 * Preallocate memory for splitting -##### Version 0.2.2 +# ranger 0.2.2 * Remove recursive splitting -##### Version 0.2.1 +# ranger 0.2.1 * Allow matrix as input data in R version -##### Version 0.2.0 +# ranger 0.2.0 * Fix prediction of 
classification forests in R -##### Version 0.1.9 +# ranger 0.1.9 * Speedup growing for continuous covariates * Add memory save option to save memory for very large datasets (but slower) * Remove memory mode option from R version since no performance gain -##### Version 0.1.8 +# ranger 0.1.8 * Fix problems when using Rcpp <0.11.4 -##### Version 0.1.7 +# ranger 0.1.7 * Add option to split on unordered categorical covariates -##### Version 0.1.6 +# ranger 0.1.6 * Optimize memory management for very large survival forests -##### Version 0.1.5 +# ranger 0.1.5 * Set required Rcpp version to 0.11.2 * Fix large $call objects when using BatchJobs * Add details and example on GenABEL usage to documentation * Minor changes to documentation -##### Version 0.1.4 +# ranger 0.1.4 * Speedup for survival forests with continuous covariates * R version: Generate seed from R. It is no longer necessary to set the seed argument in ranger calls. -##### Version 0.1.3 +# ranger 0.1.3 * Windows support for R version (without multithreading) -##### Version 0.1.2 +# ranger 0.1.2 * Speedup growing of regression and probability prediction forests * Prediction forests are now handled like regression forests: MSE used for prediction error and permutation importance From 31ff9a272a86ccc9e3733eed17e8725686c37436 Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Wed, 12 May 2021 13:17:43 +0200 Subject: [PATCH 015/111] Basic _pkgdown.yml --- _pkgdown.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index e69de29bb..829cd4a81 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -0,0 +1,20 @@ + development: + mode: auto + +url: https://imbs-hl.github.io/ranger + +home: + title: A fast implementation of random forests + description: > + Some description + +authors: + Marvin N. 
Wright: + href: https://wrig.de + +news: + one_page: true + cran_dates: true + # releases: + # - text: "ranger 0.12.4" + # href: link From 9425d3578dd45e13c3593794fb9a9d4bb54879f5 Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Wed, 12 May 2021 13:21:10 +0200 Subject: [PATCH 016/111] Typo --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 829cd4a81..554585bcf 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,4 @@ - development: +development: mode: auto url: https://imbs-hl.github.io/ranger From 98700f4d1f5ac344c743aceff638d6db29c9f42a Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Wed, 12 May 2021 13:26:30 +0200 Subject: [PATCH 017/111] =?UTF-8?q?No=20https=20for=20wrig.de=20?= =?UTF-8?q?=F0=9F=99=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 554585bcf..f2ddd2c65 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -10,7 +10,7 @@ home: authors: Marvin N. Wright: - href: https://wrig.de + href: http://wrig.de news: one_page: true From 12084e98911f385e4bd273ad782af97b4e73c42f Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Thu, 13 May 2021 13:26:06 +0200 Subject: [PATCH 018/111] Use description from DESCRIPTION file --- _pkgdown.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index f2ddd2c65..ef571c884 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -4,9 +4,14 @@ development: url: https://imbs-hl.github.io/ranger home: - title: A fast implementation of random forests + title: A fast implementation of Random Forests description: > - Some description + A fast implementation of Random Forests, particularly suited for high + dimensional data. Ensembles of classification, regression, survival and + probability prediction trees are supported. 
Data from genome-wide association + studies can be analyzed efficiently. In addition to data frames, datasets of + class 'gwaa.data' (R package 'GenABEL') and 'dgCMatrix' (R package 'Matrix') + can be directly analyzed. authors: Marvin N. Wright: From 3461ed8d7f4d7fac16d2dcae1ceece4efda668eb Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 21 Jun 2021 21:48:16 +0200 Subject: [PATCH 019/111] fix bug with reading treetype #566 --- cpp_version/src/utility/ArgumentHandler.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp_version/src/utility/ArgumentHandler.cpp b/cpp_version/src/utility/ArgumentHandler.cpp index 3ba8cd0e8..8922760aa 100644 --- a/cpp_version/src/utility/ArgumentHandler.cpp +++ b/cpp_version/src/utility/ArgumentHandler.cpp @@ -461,11 +461,25 @@ void ArgumentHandler::checkArguments() { throw std::runtime_error("Could not read from input file: " + predict + "."); } - // Do not read num_variables, num_trees and is_ordered_variable - infile.seekg(sizeof(size_t)); + // Do not read dependent variable names + uint num_dependent_variables; + infile.read((char*) &num_dependent_variables, sizeof(num_dependent_variables)); + for (size_t i = 0; i < num_dependent_variables; ++i) { + size_t length; + infile.read((char*) &length, sizeof(size_t)); + infile.ignore(length); + } + + // Do not read num_trees + infile.ignore(sizeof(size_t)); + + // Do not read is_ordered_variable size_t length; infile.read((char*) &length, sizeof(length)); - infile.seekg(4 * sizeof(size_t) + length * sizeof(bool)); + infile.ignore(length * sizeof(bool)); + + // Do not read number of variables + infile.ignore(sizeof(size_t)); // Get treetype infile.read((char*) &treetype, sizeof(treetype)); From cfc6d8729e1a9a1eb599cecbb712b8fc586927b0 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 9 Jul 2021 06:52:56 +0200 Subject: [PATCH 020/111] new version vor CRAN --- .Rbuildignore | 1 + DESCRIPTION | 6 +++--- NEWS | 12 ++---------- NEWS.md | 3 
+++ cpp_version/src/version.h | 2 +- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index c8160463c..9a18dc031 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,4 @@ ^.*\.tar.gz$ ^cpp\_version$ ^\.github$ +^revdep$ diff --git a/DESCRIPTION b/DESCRIPTION index ff46f4958..1f743459e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.12.4 -Date: 2021-05-03 +Version: 0.13.0 +Date: 2021-07-09 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -20,6 +20,6 @@ Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/NEWS b/NEWS index 0a1acc36f..747a902eb 100644 --- a/NEWS +++ b/NEWS @@ -1,17 +1,9 @@ -##### Version 0.12.4 -* Bug fixes - -##### Version 0.12.3 +##### Version 0.13 +* Faster quantile prediction * Add ... 
argument to ranger() * Bug fixes -##### Version 0.12.2 -* Bug fixes - -##### Version 0.12.1 -* Bug fixes - ##### Version 0.12.0 * Faster computation (in some cases) * Add local variable importance diff --git a/NEWS.md b/NEWS.md index 2cc95defc..3084e10a8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +##### Version 0.13.0 +* New CRAN version + ##### Version 0.12.4 * Bug fixes diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index f5f4f23af..26a8d2d57 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.12.4" +#define RANGER_VERSION "0.13.0" #endif From 8aab3526a5f1ceb3ee6fa16a704ae2b4d66900ca Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 14 Jul 2021 06:11:12 +0200 Subject: [PATCH 021/111] update date --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1f743459e..3e256fee3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests Version: 0.13.0 -Date: 2021-07-09 +Date: 2021-07-14 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N.
Wright Description: A fast implementation of Random Forests, particularly suited for high From 2fb93d780e48139703789f2c70b165fa4b8bd19f Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 14 Jul 2021 10:31:46 +0200 Subject: [PATCH 022/111] fix https urls --- DESCRIPTION | 2 +- R/predict.R | 6 +++--- R/ranger.R | 2 +- cpp_version/src/version.h | 2 +- man/predict.ranger.Rd | 4 ++-- man/predict.ranger.forest.Rd | 2 +- man/ranger.Rd | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e256fee3..6091b2904 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.13.0 +Version: 0.13.1 Date: 2021-07-14 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright diff --git a/R/predict.R b/R/predict.R index 9eebd74ec..4c200b4f3 100644 --- a/R/predict.R +++ b/R/predict.R @@ -63,7 +63,7 @@ ##' @references ##' \itemize{ ##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. -##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{http://jmlr.org/papers/v15/wager14a.html}. +##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. ##' } ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright @@ -477,8 +477,8 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ##' @references ##' \itemize{ ##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. 
\url{https://doi.org/10.18637/jss.v077.i01}. -##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{http://jmlr.org/papers/v15/wager14a.html}. -##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{http://www.jmlr.org/papers/v7/meinshausen06a.html}. +##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. +##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. ##' } ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright diff --git a/R/ranger.R b/R/ranger.R index ad3d2d74d..8d22b0e44 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -196,7 +196,7 @@ ##' \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \url{https://doi.org/10.3414/ME00-01-0052}. ##' \item Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical Learning. Springer, New York. 2nd edition. ##' \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \url{https://doi.org/10.1007/s10994-006-6226-1}. -##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{http://www.jmlr.org/papers/v7/meinshausen06a.html}. +##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. ##' \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. 
\url{https://doi.org/10.1198/106186008X344522}. ##' \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \url{https://doi.org/10.1023/A:1009869804967}. ##' \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \url{https://doi.org/10.1109/IJCNN.2012.6252640}. diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 26a8d2d57..32040faf3 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.13.0" +#define RANGER_VERSION "0.13.1" #endif diff --git a/man/predict.ranger.Rd b/man/predict.ranger.Rd index d096fee9a..4f94201ff 100644 --- a/man/predict.ranger.Rd +++ b/man/predict.ranger.Rd @@ -96,8 +96,8 @@ pred$predictions \references{ \itemize{ \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. - \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{http://jmlr.org/papers/v15/wager14a.html}. - \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{http://www.jmlr.org/papers/v7/meinshausen06a.html}. + \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. + \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. 
} } \seealso{ diff --git a/man/predict.ranger.forest.Rd b/man/predict.ranger.forest.Rd index 2d4d9ad28..32e01b4a1 100644 --- a/man/predict.ranger.forest.Rd +++ b/man/predict.ranger.forest.Rd @@ -70,7 +70,7 @@ To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \co \references{ \itemize{ \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. - \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{http://jmlr.org/papers/v15/wager14a.html}. + \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. } } \seealso{ diff --git a/man/ranger.Rd b/man/ranger.Rd index 727ed5684..89a98c2b9 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -259,7 +259,7 @@ ranger(trait ~ ., data = dat.gwaa) \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \url{https://doi.org/10.3414/ME00-01-0052}. \item Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical Learning. Springer, New York. 2nd edition. \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \url{https://doi.org/10.1007/s10994-006-6226-1}. - \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{http://www.jmlr.org/papers/v7/meinshausen06a.html}. + \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. \item Sandri, M. & Zuccolotto, P. (2008). 
A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \url{https://doi.org/10.1198/106186008X344522}. \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \url{https://doi.org/10.1023/A:1009869804967}. \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \url{https://doi.org/10.1109/IJCNN.2012.6252640}. From 9916f1e77ed1732d15a3e0b143da0468fbbe5e81 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 14 Jul 2021 11:15:23 +0200 Subject: [PATCH 023/111] add github action for CRAN url checks --- .github/workflows/R-CMD-check-cran.yaml | 78 +++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 .github/workflows/R-CMD-check-cran.yaml diff --git a/.github/workflows/R-CMD-check-cran.yaml b/.github/workflows/R-CMD-check-cran.yaml new file mode 100644 index 000000000..dc1a86b9b --- /dev/null +++ b/.github/workflows/R-CMD-check-cran.yaml @@ -0,0 +1,78 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 
+# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: workflow_dispatch + +name: R-CMD-check-cran + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'devel'} + - {os: macOS-latest, r: 'devel'} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + if: runner.os != 'Windows' + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_: true + _R_CHECK_CRAN_INCOMING_REMOTE_: true + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: 
Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.config.r }}-results + path: check From c68c896ecee71ff6f376cebc558c3686932a0677 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 14 Jul 2021 11:49:28 +0200 Subject: [PATCH 024/111] merge with cran checks From c4f4276e40673c7b9675723e9eace465869df50a Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 14 Jul 2021 12:03:46 +0200 Subject: [PATCH 025/111] fix DOI links --- R/csrf.R | 2 +- R/holdoutRF.R | 2 +- R/importance.R | 2 +- R/predict.R | 4 ++-- R/ranger.R | 22 +++++++++++----------- R/utility.R | 2 +- man/csrf.Rd | 2 +- man/holdoutRF.Rd | 2 +- man/importance_pvalues.Rd | 2 +- man/predict.ranger.Rd | 2 +- man/predict.ranger.forest.Rd | 2 +- man/ranger.Rd | 22 +++++++++++----------- 12 files changed, 33 insertions(+), 33 deletions(-) diff --git a/R/csrf.R b/R/csrf.R index 758dbe6dc..ad7c1a1a6 100644 --- a/R/csrf.R +++ b/R/csrf.R @@ -62,7 +62,7 @@ ##' ##' @author Marvin N. Wright ##' @references -##' Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \url{https://doi.org/10.1080/10618600.2014.983641}. +##' Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \doi{10.1080/10618600.2014.983641}. ##' @export csrf <- function(formula, training_data, test_data, params1 = list(), params2 = list(), verbose = FALSE) { ## Grow a random forest on the training data to obtain weights diff --git a/R/holdoutRF.R b/R/holdoutRF.R index cec5b634b..261086712 100644 --- a/R/holdoutRF.R +++ b/R/holdoutRF.R @@ -36,7 +36,7 @@ ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @references -##' Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{https://doi.org/10.1007/s11634-016-0276-4}. \cr +##' Janitza, S., Celik, E. 
& Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \doi{10.1007/s11634-016-0276-4}. \cr ##' @export holdoutRF <- function(...) { diff --git a/R/importance.R b/R/importance.R index a7328d07d..8ae436c5b 100644 --- a/R/importance.R +++ b/R/importance.R @@ -91,7 +91,7 @@ importance.ranger <- function(x, ...) { ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @references -##' Janitza, S., Celik, E. & Boulesteix, A.-L., (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{https://doi.org/10.1007/s11634-016-0276-4}. \cr +##' Janitza, S., Celik, E. & Boulesteix, A.-L., (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \doi{10.1007/s11634-016-0276-4}. \cr ##' Altmann, A., Tolosi, L., Sander, O. & Lengauer, T. (2010). Permutation importance: a corrected feature importance measure, Bioinformatics 26:1340-1347. ##' @export importance_pvalues <- function(x, method = c("janitza", "altmann"), num.permutations = 100, formula = NULL, data = NULL, ...) { diff --git a/R/predict.R b/R/predict.R index 4c200b4f3..1c06d8cfc 100644 --- a/R/predict.R +++ b/R/predict.R @@ -62,7 +62,7 @@ ##' } ##' @references ##' \itemize{ -##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. +##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. ##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. 
##' } ##' @seealso \code{\link{ranger}} @@ -476,7 +476,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ##' ##' @references ##' \itemize{ -##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. +##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. ##' \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. ##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. ##' } diff --git a/R/ranger.R b/R/ranger.R index 8d22b0e44..c80021897 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -187,19 +187,19 @@ ##' @author Marvin N. Wright ##' @references ##' \itemize{ -##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. -##' \item Schmid, M., Wright, M. N. & Ziegler, A. (2016). On the use of Harrell's C for clinical risk prediction via random survival forests. Expert Syst Appl 63:450-459. \url{https://doi.org/10.1016/j.eswa.2016.07.018}. -##' \item Wright, M. N., Dankowski, T. & Ziegler, A. (2017). Unbiased split variable selection for random survival forests using maximally selected rank statistics. Stat Med 36:1272-1284. \url{https://doi.org/10.1002/sim.7212}. -##' \item Nembrini, S., Koenig, I. R. & Wright, M. N. (2018). The revival of the Gini Importance? Bioinformatics. \url{https://doi.org/10.1093/bioinformatics/bty373}. -##' \item Breiman, L. (2001). Random forests. Mach Learn, 45:5-32. 
\url{https://doi.org/10.1023/A:1010933404324}. -##' \item Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. Ann Appl Stat 2:841-860. \url{https://doi.org/10.1097/JTO.0b013e318233d835}. -##' \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \url{https://doi.org/10.3414/ME00-01-0052}. +##' \item Wright, M. N. & Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. +##' \item Schmid, M., Wright, M. N. & Ziegler, A. (2016). On the use of Harrell's C for clinical risk prediction via random survival forests. Expert Syst Appl 63:450-459. \doi{10.1016/j.eswa.2016.07.018}. +##' \item Wright, M. N., Dankowski, T. & Ziegler, A. (2017). Unbiased split variable selection for random survival forests using maximally selected rank statistics. Stat Med 36:1272-1284. \doi{10.1002/sim.7212}. +##' \item Nembrini, S., Koenig, I. R. & Wright, M. N. (2018). The revival of the Gini Importance? Bioinformatics. \doi{10.1093/bioinformatics/bty373}. +##' \item Breiman, L. (2001). Random forests. Mach Learn, 45:5-32. \doi{10.1023/A:1010933404324}. +##' \item Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. Ann Appl Stat 2:841-860. \doi{10.1097/JTO.0b013e318233d835}. +##' \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \doi{10.3414/ME00-01-0052}. ##' \item Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical Learning. Springer, New York. 2nd edition. -##' \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. 
\url{https://doi.org/10.1007/s10994-006-6226-1}. +##' \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \doi{10.1007/s10994-006-6226-1}. ##' \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. -##' \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \url{https://doi.org/10.1198/106186008X344522}. -##' \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \url{https://doi.org/10.1023/A:1009869804967}. -##' \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \url{https://doi.org/10.1109/IJCNN.2012.6252640}. +##' \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \doi{10.1198/106186008X344522}. +##' \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}. +##' \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}. ##' } ##' @seealso \code{\link{predict.ranger}} ##' @useDynLib ranger, .registration = TRUE diff --git a/R/utility.R b/R/utility.R index 9fe1fed6c..3d83208ee 100644 --- a/R/utility.R +++ b/R/utility.R @@ -38,7 +38,7 @@ save.sample <- function(x, ...) { } # Order factor levels with PCA approach -# Reference: Coppersmith, D., Hong, S.J. & Hosking, J.R. (1999) Partitioning Nominal Attributes in Decision Trees. Data Min Knowl Discov 3:197. 
\url{https://doi.org/10.1023/A:1009869804967}. +# Reference: Coppersmith, D., Hong, S.J. & Hosking, J.R. (1999) Partitioning Nominal Attributes in Decision Trees. Data Min Knowl Discov 3:197. \doi{10.1023/A:1009869804967}. pca.order <- function(y, x) { x <- droplevels(x) if (nlevels(x) < 2) { diff --git a/man/csrf.Rd b/man/csrf.Rd index 1a7617388..32e86b913 100644 --- a/man/csrf.Rd +++ b/man/csrf.Rd @@ -56,7 +56,7 @@ csrf(Species ~ ., training_data = iris.train, test_data = iris.test, } \references{ -Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \url{https://doi.org/10.1080/10618600.2014.983641}. +Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \doi{10.1080/10618600.2014.983641}. } \author{ Marvin N. Wright diff --git a/man/holdoutRF.Rd b/man/holdoutRF.Rd index cc651ef2f..3c309f71a 100644 --- a/man/holdoutRF.Rd +++ b/man/holdoutRF.Rd @@ -18,7 +18,7 @@ Instead of out-of-bag data, the other fold is used to compute permutation import Related to the novel permutation variable importance by Janitza et al. (2015). } \references{ -Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{https://doi.org/10.1007/s11634-016-0276-4}. \cr +Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \doi{10.1007/s11634-016-0276-4}. \cr } \seealso{ \code{\link{ranger}} diff --git a/man/importance_pvalues.Rd b/man/importance_pvalues.Rd index d16ee323a..1cf54a746 100644 --- a/man/importance_pvalues.Rd +++ b/man/importance_pvalues.Rd @@ -63,7 +63,7 @@ importance_pvalues(rf.iris, method = "altmann", formula = Species ~ ., data = ir } } \references{ -Janitza, S., Celik, E. & Boulesteix, A.-L., (2016). 
A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{https://doi.org/10.1007/s11634-016-0276-4}. \cr +Janitza, S., Celik, E. & Boulesteix, A.-L., (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \doi{10.1007/s11634-016-0276-4}. \cr Altmann, A., Tolosi, L., Sander, O. & Lengauer, T. (2010). Permutation importance: a corrected feature importance measure, Bioinformatics 26:1340-1347. } \seealso{ diff --git a/man/predict.ranger.Rd b/man/predict.ranger.Rd index 4f94201ff..362befcae 100644 --- a/man/predict.ranger.Rd +++ b/man/predict.ranger.Rd @@ -95,7 +95,7 @@ pred$predictions } \references{ \itemize{ - \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. + \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. } diff --git a/man/predict.ranger.forest.Rd b/man/predict.ranger.forest.Rd index 32e01b4a1..ba018b0e3 100644 --- a/man/predict.ranger.forest.Rd +++ b/man/predict.ranger.forest.Rd @@ -69,7 +69,7 @@ To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \co } \references{ \itemize{ - \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. 
+ \item Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. \item Wager, S., Hastie T., & Efron, B. (2014). Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife. J Mach Learn Res 15:1625-1651. \url{https://jmlr.org/papers/v15/wager14a.html}. } } diff --git a/man/ranger.Rd b/man/ranger.Rd index 89a98c2b9..410cd7fe5 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -250,19 +250,19 @@ ranger(trait ~ ., data = dat.gwaa) } \references{ \itemize{ - \item Wright, M. N. & Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw 77:1-17. \url{https://doi.org/10.18637/jss.v077.i01}. - \item Schmid, M., Wright, M. N. & Ziegler, A. (2016). On the use of Harrell's C for clinical risk prediction via random survival forests. Expert Syst Appl 63:450-459. \url{https://doi.org/10.1016/j.eswa.2016.07.018}. - \item Wright, M. N., Dankowski, T. & Ziegler, A. (2017). Unbiased split variable selection for random survival forests using maximally selected rank statistics. Stat Med 36:1272-1284. \url{https://doi.org/10.1002/sim.7212}. - \item Nembrini, S., Koenig, I. R. & Wright, M. N. (2018). The revival of the Gini Importance? Bioinformatics. \url{https://doi.org/10.1093/bioinformatics/bty373}. - \item Breiman, L. (2001). Random forests. Mach Learn, 45:5-32. \url{https://doi.org/10.1023/A:1010933404324}. - \item Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. Ann Appl Stat 2:841-860. \url{https://doi.org/10.1097/JTO.0b013e318233d835}. - \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \url{https://doi.org/10.3414/ME00-01-0052}. + \item Wright, M. N. & Ziegler, A. (2017). 
ranger: A fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw 77:1-17. \doi{10.18637/jss.v077.i01}. + \item Schmid, M., Wright, M. N. & Ziegler, A. (2016). On the use of Harrell's C for clinical risk prediction via random survival forests. Expert Syst Appl 63:450-459. \doi{10.1016/j.eswa.2016.07.018}. + \item Wright, M. N., Dankowski, T. & Ziegler, A. (2017). Unbiased split variable selection for random survival forests using maximally selected rank statistics. Stat Med 36:1272-1284. \doi{10.1002/sim.7212}. + \item Nembrini, S., Koenig, I. R. & Wright, M. N. (2018). The revival of the Gini Importance? Bioinformatics. \doi{10.1093/bioinformatics/bty373}. + \item Breiman, L. (2001). Random forests. Mach Learn, 45:5-32. \doi{10.1023/A:1010933404324}. + \item Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. Ann Appl Stat 2:841-860. \doi{10.1097/JTO.0b013e318233d835}. + \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med 51:74-81. \doi{10.3414/ME00-01-0052}. \item Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical Learning. Springer, New York. 2nd edition. - \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \url{https://doi.org/10.1007/s10994-006-6226-1}. + \item Geurts, P., Ernst, D., Wehenkel, L. (2006). Extremely randomized trees. Mach Learn 63:3-42. \doi{10.1007/s10994-006-6226-1}. \item Meinshausen (2006). Quantile Regression Forests. J Mach Learn Res 7:983-999. \url{https://www.jmlr.org/papers/v7/meinshausen06a.html}. - \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \url{https://doi.org/10.1198/106186008X344522}. - \item Coppersmith D., Hong S. 
J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \url{https://doi.org/10.1023/A:1009869804967}. - \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \url{https://doi.org/10.1109/IJCNN.2012.6252640}. + \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \doi{10.1198/106186008X344522}. + \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}. + \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}. } } \seealso{ From b07f5bcb6250fb7e4089092316ba2ac70b10e911 Mon Sep 17 00:00:00 2001 From: SvenVw <37927107+SvenVw@users.noreply.github.com> Date: Fri, 16 Jul 2021 10:49:04 +0200 Subject: [PATCH 026/111] Fix typo of paralellization -> parallelization Hi, just a small fix for a little typo --- R/ranger.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index c80021897..bebe3ea21 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -539,10 +539,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, regularization.factor <- c(0, 0) use.regularization.factor <- FALSE } else { - # Deactivation of paralellization + # Deactivation of parallelization if (num.threads != 1) { num.threads <- 1 - warning("Paralellization deactivated (regularization used).") + warning("Parallelization deactivated (regularization used).") } use.regularization.factor <- TRUE } From a6c0d4135021afe09be81f848add0cbd6ddebb1d Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 20 Jul 2021 16:27:26 +0200 Subject: 
[PATCH 027/111] fix another parallelization typo --- cpp_version/src/utility/ArgumentHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp_version/src/utility/ArgumentHandler.cpp b/cpp_version/src/utility/ArgumentHandler.cpp index 8922760aa..761b23448 100644 --- a/cpp_version/src/utility/ArgumentHandler.cpp +++ b/cpp_version/src/utility/ArgumentHandler.cpp @@ -534,7 +534,7 @@ void ArgumentHandler::checkArguments() { } if (nthreads != 1) { - std::cout << "Warning: Paralellization deactivated (regularization used)." << std::endl; + std::cout << "Warning: Parallelization deactivated (regularization used)." << std::endl; nthreads = 1; } } From 427763fe299df65b04d005ca83fe9b5f585586d6 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 30 Jul 2021 07:44:29 +0200 Subject: [PATCH 028/111] fix NEWS.md for pkgdown --- NEWS.md | 186 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3084e10a8..880e3d868 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,254 +1,254 @@ -##### Version 0.12.0 +# ranger 0.13.0 * New CRAN version -##### Version 0.12.4 +# ranger 0.12.4 * Bug fixes -##### Version 0.12.3 +# ranger 0.12.3 * Add ... 
argument to ranger() * Bug fixes -##### Version 0.12.2 +# ranger 0.12.2 * Bug fixes -##### Version 0.12.1 +# ranger 0.12.1 * Bug fixes -##### Version 0.12.0 +# ranger 0.12.0 * New CRAN version -##### Version 0.11.8 +# ranger 0.11.8 * Add regularization * Faster computation (in some cases) -##### Version 0.11.7 +# ranger 0.11.7 * Add local variable importance -##### Version 0.11.6 +# ranger 0.11.6 * Add "hellinger" splitrule for binary classification -##### Version 0.11.5 +# ranger 0.11.5 * Add x/y interface * Internal changes (seed differences possible, prediction incompatible with older versions) -##### Version 0.11.4 +# ranger 0.11.4 * Add "beta" splitrule for bounded outcomes -##### Version 0.11.3 +# ranger 0.11.3 * Accept user-specified function in quantile prediction -##### Version 0.11.2 +# ranger 0.11.2 * Bug fixes -##### Version 0.11.1 +# ranger 0.11.1 * Bug fixes -##### Version 0.11.0 +# ranger 0.11.0 * New CRAN version -##### Version 0.10.6 +# ranger 0.10.6 * Internal changes (slightly improved computation speed) * Warning: Possible seed differences compared to older versions * Bug fixes -##### Version 0.10.5 +# ranger 0.10.5 * Add support of splitting weights for corrected impurity importance * Bug fixes -##### Version 0.10.4 +# ranger 0.10.4 * Add inbag argument for manual selection of observations in trees -##### Version 0.10.3 +# ranger 0.10.3 * Bug fixes -##### Version 0.10.2 +# ranger 0.10.2 * Add max.depth parameter to limit tree depth -##### Version 0.10.1 +# ranger 0.10.1 * Bug fixes -##### Version 0.10.0 +# ranger 0.10.0 * New CRAN version -##### Version 0.9.12 +# ranger 0.9.12 * Remove GenABEL from suggested packages (removed from CRAN). 
GenABEL data is still supported -##### Version 0.9.11 +# ranger 0.9.11 * Improve memory management (internal changes) -##### Version 0.9.10 +# ranger 0.9.10 * Add impurity importance for the maxstat splitting rule * Bug fixes -##### Version 0.9.9 +# ranger 0.9.9 * Add 'order' mode for unordered factors for GenABEL SNP data (binary classification and regression) -##### Version 0.9.8 +# ranger 0.9.8 * Bug fixes -##### Version 0.9.7 +# ranger 0.9.7 * Change license of C++ core to MIT (R package is still GPL3) -##### Version 0.9.6 +# ranger 0.9.6 * Better 'order' mode for unordered factors for multiclass and survival -##### Version 0.9.5 +# ranger 0.9.5 * Bug fixes -##### Version 0.9.4 +# ranger 0.9.4 * Add class-weighted Gini splitting -##### Version 0.9.3 +# ranger 0.9.3 * Bug fixes -##### Version 0.9.2 +# ranger 0.9.2 * Add fixed proportion sampling -##### Version 0.9.1 +# ranger 0.9.1 * Bug fixes -##### Version 0.9.0 +# ranger 0.9.0 * New CRAN version -##### Version 0.8.5 +# ranger 0.8.5 * Faster aggregation of predictions * Fix memory issues on Windows 7 * Add treeInfo() function to extract human readable tree structure -##### Version 0.8.4 +# ranger 0.8.4 * Add quantile prediction as in quantile regression forests -##### Version 0.8.3 +# ranger 0.8.3 * Add standard error estimation with the infinitesimal jackknife (now the default) -##### Version 0.8.2 +# ranger 0.8.2 * Add bias-corrected impurity importance (actual impurity reduction, AIR) * Add impurity importance for survival forests -##### Version 0.8.1 +# ranger 0.8.1 * Bug fixes -##### Version 0.8.0 +# ranger 0.8.0 * New CRAN version -##### Version 0.7.2 +# ranger 0.7.2 * Handle sparse data of class Matrix::dgCMatrix * Add prediction of standard errors to predict() -##### Version 0.7.1 +# ranger 0.7.1 * Allow devtools::install_github() without subdir and on Windows * Bug fixes -##### Version 0.7.0 +# ranger 0.7.0 * New CRAN version -##### Version 0.6.7 +# ranger 0.6.7 * Improvements in holdoutRF and 
importance p-value estimation -##### Version 0.6.6 +# ranger 0.6.6 * Split at mid-point between candidate values -##### Version 0.6.5 +# ranger 0.6.5 * Better formula interface: Support interactions terms and faster computation -##### Version 0.6.4 +# ranger 0.6.4 * Add randomized splitting (extraTrees) -##### Version 0.6.3 +# ranger 0.6.3 * Bug fixes -##### Version 0.6.2 +# ranger 0.6.2 * Drop unused factor levels in outcome before growing * Add predict.all for probability and survival prediction -##### Version 0.6.1 +# ranger 0.6.1 * Bug fixes -##### Version 0.6.0 +# ranger 0.6.0 * New CRAN version -##### Version 0.5.6 +# ranger 0.5.6 * Faster version of getTerminalNodeIDs(), included in predict() -##### Version 0.5.5 +# ranger 0.5.5 * Handle new factor levels in 'order' mode * Bug fixes -##### Version 0.5.4 +# ranger 0.5.4 * Set write.forest=TRUE by default * Add num.trees option to predict() * Bug fixes -##### Version 0.5.3 +# ranger 0.5.3 * Bug fixes -##### Version 0.5.2 +# ranger 0.5.2 * Use unadjusted p-value for 2 categories in maxstat splitting -##### Version 0.5.1 +# ranger 0.5.1 * Bug fixes -##### Version 0.5.0 +# ranger 0.5.0 * New CRAN version -##### Version 0.4.7 +# ranger 0.4.7 * Add splitting by maximally selected rank statistics for regression forests -##### Version 0.4.6 +# ranger 0.4.6 * Bug fixes -##### Version 0.4.5 +# ranger 0.4.5 * Use faster method for unordered factor splitting -##### Version 0.4.4 +# ranger 0.4.4 * Add p-values for variable importance * Bug fixes -##### Version 0.4.3 +# ranger 0.4.3 * Add splitting by maximally selected rank statistics for survival forests * Bug fixes -##### Version 0.4.2 +# ranger 0.4.2 * Add Windows multithreading support for new toolchain -##### Version 0.4.1 +# ranger 0.4.1 * Runtime improvement for regression forests on classification data -##### Version 0.4.0 +# ranger 0.4.0 * New CRAN version. 
New CRAN versions will be 0.x.0, development versions 0.x.y -##### Version 0.3.9 +# ranger 0.3.9 * Reduce memory usage of savest forest objects (changed child.nodeIDs interface) -##### Version 0.3.8 +# ranger 0.3.8 * Remove tuning functions, please use mlr or caret -##### Version 0.3.7 +# ranger 0.3.7 * Fix bug with alternative interface and prediction * Small fixes -##### Version 0.3.6 +# ranger 0.3.6 * Add keep.inbag option to track in-bag counts * Add option sample.fraction for fraction of sampled observations -##### Version 0.3.5 +# ranger 0.3.5 * Add tree-wise split.select.weights -##### Version 0.3.4 +# ranger 0.3.4 * Add predict.all option in predict() to get individual predictions for each tree for classification and regression * Small changes in documentation -##### Version 0.3.3 +# ranger 0.3.3 * Add case-specific random forests -##### Version 0.3.2 +# ranger 0.3.2 * Add case weights (weighted bootstrapping or subsampling) -##### Version 0.3.1 +# ranger 0.3.1 * Catch error of outdated gcc not supporting C++11 completely -##### Version 0.3.0 +# ranger 0.3.0 * Allow the user to interrupt computation from R * Transpose classification.table and rename to confusion.matrix * Respect R seed for prediction @@ -256,61 +256,61 @@ * Fix bug: Probability prediction for single observations * Fix bug: Results not identical when using alternative interface -##### Version 0.2.7 +# ranger 0.2.7 * Small fixes for Solaris compiler -##### Version 0.2.6 +# ranger 0.2.6 * Add C-index splitting * Fix NA SNP handling -##### Version 0.2.5 +# ranger 0.2.5 * Fix matrix and gwaa alternative survival interface * Version submitted to JSS -##### Version 0.2.4 +# ranger 0.2.4 * Small changes in documentation -##### Version 0.2.3 +# ranger 0.2.3 * Preallocate memory for splitting -##### Version 0.2.2 +# ranger 0.2.2 * Remove recursive splitting -##### Version 0.2.1 +# ranger 0.2.1 * Allow matrix as input data in R version -##### Version 0.2.0 +# ranger 0.2.0 * Fix prediction of 
classification forests in R -##### Version 0.1.9 +# ranger 0.1.9 * Speedup growing for continuous covariates * Add memory save option to save memory for very large datasets (but slower) * Remove memory mode option from R version since no performance gain -##### Version 0.1.8 +# ranger 0.1.8 * Fix problems when using Rcpp <0.11.4 -##### Version 0.1.7 +# ranger 0.1.7 * Add option to split on unordered categorical covariates -##### Version 0.1.6 +# ranger 0.1.6 * Optimize memory management for very large survival forests -##### Version 0.1.5 +# ranger 0.1.5 * Set required Rcpp version to 0.11.2 * Fix large $call objects when using BatchJobs * Add details and example on GenABEL usage to documentation * Minor changes to documentation -##### Version 0.1.4 +# ranger 0.1.4 * Speedup for survival forests with continuous covariates * R version: Generate seed from R. It is no longer necessary to set the seed argument in ranger calls. -##### Version 0.1.3 +# ranger 0.1.3 * Windows support for R version (without multithreading) -##### Version 0.1.2 +# ranger 0.1.2 * Speedup growing of regression and probability prediction forests * Prediction forests are now handled like regression forests: MSE used for prediction error and permutation importance From 2cedf125c5a5aac825d03a67f420956a9309515b Mon Sep 17 00:00:00 2001 From: bgreenwell Date: Fri, 30 Jul 2021 15:16:07 -0400 Subject: [PATCH 029/111] start afresh and squash conflicts --- NAMESPACE | 3 + R/deforest.R | 175 +++++++++++++++++++++++++++++++++ man/deforest.Rd | 68 +++++++++++++ man/print.deforest.ranger.Rd | 29 ++++++ tests/testthat/test_deforest.R | 43 ++++++++ 5 files changed, 318 insertions(+) create mode 100644 R/deforest.R create mode 100644 man/deforest.Rd create mode 100644 man/print.deforest.ranger.Rd create mode 100644 tests/testthat/test_deforest.R diff --git a/NAMESPACE b/NAMESPACE index b8dd585b3..b27cb67bd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,19 @@ # Generated by roxygen2: do not edit by hand 
+S3method(deforest,ranger) S3method(importance,ranger) S3method(predict,ranger) S3method(predict,ranger.forest) S3method(predictions,ranger) S3method(predictions,ranger.prediction) +S3method(print,deforest.ranger) S3method(print,ranger) S3method(print,ranger.forest) S3method(print,ranger.prediction) S3method(timepoints,ranger) S3method(timepoints,ranger.prediction) export(csrf) +export(deforest) export(getTerminalNodeIDs) export(holdoutRF) export(importance) diff --git a/R/deforest.R b/R/deforest.R new file mode 100644 index 000000000..dbe4bef66 --- /dev/null +++ b/R/deforest.R @@ -0,0 +1,175 @@ +#' Deforesting a random forest +#' +#' The main purpose of this function is to allow for post-processing of +#' ensembles via L1 regularized regression (i.e., the LASSO), as described in +#' Friedman and Popescu (2003). The basic idea is to use the LASSO to +#' post-process the predictions from the individual base learners in an ensemble +#' (i.e., decision trees) in the hopes of producing a much smaller model without +#' sacrificing much in the way of accuracy, and in some cases, improving it. +#' Friedman and Popescu (2003) describe conditions under which tree-based +#' ensembles, like random forest, can potentially benefit from such +#' post-processing (e.g., using shallower trees trained on much smaller samples +#' of the training data without replacement). However, the computational +#' benefits of such post-processing can only be realized if the base learners +#' "zeroed out" by the LASSO can actually be removed from the original ensemble, +#' hence the purpose of this function. A complete example using +#' \code{\link{ranger}} can be found at +#' \url{https://github.com/imbs-hl/ranger/issues/568}. +#' +#' @param object A fitted random forest (e.g., a \code{\link{ranger}} +#' object). +#' +#' @param which.trees Vector giving the indices of the trees to remove. 
+#' +#' @param warn Logical indicating whether or not to warn users that some of the +#' standard output of a typical \code{\link{ranger}} object are no longer +#' available after deforestation. Default is \code{TRUE}. +#' +#' @param ... Additional (optional) arguments. (Currently ignored.) +#' +#' @return An object of class \code{"deforest.ranger"}; essentially, a +#' \code{\link{ranger}} object with certain components replaced with +#' \code{NA}s (e.g., out-of-bag (OOB) predictions, variable importance scores +#' (if requested), and OOB-based error metrics). +#' +#' @note This function is a generic and can be extended by other packages. +#' +#' @references +#' Friedman, J. and Popescu, B. (2003). Importance sampled learning ensembles, +#' Technical report, Stanford University, Department of Statistics. +#' \url{https://statweb.stanford.edu/~jhf/ftp/isle.pdf}. +#' +#' @rdname deforest +#' +#' @export +#' +#' @author Brandon M. Greenwell +#' +#' @examples +#' ## Example of deforesting a random forest +#' rfo <- ranger(Species ~ ., data = iris, probability = TRUE, num.trees = 100) +#' dfo <- deforest(rfo, which.trees = c(1, 3, 5)) +#' dfo # same as `rfo` but with trees 1, 3, and 5 removed +#' +#' ## Sanity check +#' preds.rfo <- predict(rfo, data = iris, predict.all = TRUE)$predictions +#' preds.dfo <- predict(dfo, data = iris, predict.all = TRUE)$predictions +#' identical(preds.rfo[, , -c(1, 3, 5)], y = preds.dfo) +deforest <- function(object, which.trees = NULL, ...) { + UseMethod("deforest") +} + + +#' @rdname deforest +#' +#' @export +deforest.ranger <- function(object, which.trees = NULL, warn = TRUE, ...) 
{ + + # Warn users about `predictions` and `prediction.error` components + if (isTRUE(warn)) { + warning("Many of the components of a typical \"ranger\" object are ", + "not available after deforestation and are instead replaced with ", + "`NA` (e.g., out-of-bag (OOB) predictions, variable importance ", + "scores (if requested), and OOB-based error metrics).", + call. = FALSE) + } + + # "Remove trees" by removing necessary components from `forest` object + object$forest$child.nodeIDs[which.trees] <- NULL + object$forest$split.values[which.trees] <- NULL + object$forest$split.varIDs[which.trees] <- NULL + object$forest$terminal.class.counts[which.trees] <- NULL # for prob forests + object$forest$chf[which.trees] <- NULL # for survival forests + + # Update `num.trees` components so `predict.ranger()` works + object$forest$num.trees <- object$num.trees <- + length(object$forest$child.nodeIDs) + + # Coerce other components to `NA` as needed + if (!is.null(object$prediction.error)) { + object$prediction.error <- NA + } + if (!is.null(object$predictions)) { # classification and regression + object$predictions[] <- NA + } + if (!is.null(object$r.squared)) { # regression + object$r.squared <- NA + } + if (!is.null(object$chf)) { # survival forests + object$chf[] <- NA + } + if (!is.null(object$survival)) { # survival forests + object$survival[] <- NA + } + if (object$importance.mode != "none") { # variable importance + object$importance.mode <- NA + object$variable.importance[] <- NA + } + + # Return "deforested" forest + class(object) <- c("deforest.ranger", class(object)) + object + +} + + +#' Print deforested ranger summary +#' +#' Print basic information about a deforested \code{\link{ranger}} object. +#' +#' @param x A \code{\link{deforest}} object (i.e., an object that inherits from +#' class \code{"deforest.ranger"}). +#' +#' @param ... Further arguments passed to or from other methods. 
+#' +#' @note Many of the components of a typical \code{\link{ranger}} object are not +#' available after deforestation and are instead replaced with \code{NA} (e.g., +#' out-of-bag (OOB) predictions, variable importance scores (if requested), and +#' OOB-based error metrics). +#' +#' @seealso \code{\link{deforest}}. +#' +#' @author Brandon M. Greenwell +#' +#' @export +print.deforest.ranger <- function (x, ...) { + cat("Ranger (deforested) result\n\n") + cat("Note that many of the components of a typical \"ranger\" object are", + "not available after deforestation and are instead replaced with `NA`", + "(e.g., out-of-bag (OOB) predictions, variable importance scores (if", + "requested), and OOB-based error metrics)", + "\n\n") + cat("Type: ", x$treetype, "\n") + cat("Number of trees: ", x$num.trees, "\n") + cat("Sample size: ", x$num.samples, "\n") + cat("Number of independent variables: ", x$num.independent.variables, "\n") + cat("Mtry: ", x$mtry, "\n") + cat("Target node size: ", x$min.node.size, "\n") + cat("Variable importance mode: ", x$importance.mode, "\n") + cat("Splitrule: ", x$splitrule, "\n") + if (x$treetype == "Survival") { + cat("Number of unique death times: ", length(x$unique.death.times), "\n") + } + if (!is.null(x$splitrule) && x$splitrule == "extratrees" && + !is.null(x$num.random.splits)) { + cat("Number of random splits: ", x$num.random.splits, "\n") + } + if (x$treetype == "Classification") { + cat("OOB prediction error: ", x$prediction.error, "\n") + } + else if (x$treetype == "Regression") { + cat("OOB prediction error (MSE): ", x$prediction.error, "\n") + } + else if (x$treetype == "Survival") { + cat("OOB prediction error (1-C): ", x$prediction.error, "\n") + } + else if (x$treetype == "Probability estimation") { + cat("OOB prediction error (Brier s.): ", x$prediction.error, "\n") + } + else { + cat("OOB prediction error: ", x$prediction.error, "\n") + } + if (x$treetype == "Regression") { + cat("R squared (OOB): ", x$r.squared, "\n") + } 
+} diff --git a/man/deforest.Rd b/man/deforest.Rd new file mode 100644 index 000000000..2e583a2cf --- /dev/null +++ b/man/deforest.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deforest.R +\name{deforest} +\alias{deforest} +\alias{deforest.ranger} +\title{Deforesting a random forest} +\usage{ +deforest(object, which.trees = NULL, ...) + +\method{deforest}{ranger}(object, which.trees = NULL, warn = TRUE, ...) +} +\arguments{ +\item{object}{A fitted random forest (e.g., a \code{\link{ranger}} +object).} + +\item{which.trees}{Vector giving the indices of the trees to remove.} + +\item{...}{Additional (optional) arguments. (Currently ignored.)} + +\item{warn}{Logical indicating whether or not to warn users that some of the +standard output of a typical \code{\link{ranger}} object are no longer +available after deforestation. Default is \code{TRUE}.} +} +\value{ +An object of class \code{"deforest.ranger"}; essentially, a +\code{\link{ranger}} object with certain components replaced with +\code{NA}s (e.g., out-of-bag (OOB) predictions, variable importance scores +(if requested), and OOB-based error metrics). +} +\description{ +The main purpose of this function is to allow for post-processing of +ensembles via L1 regularized regression (i.e., the LASSO), as described in +Friedman and Popescu (2003). The basic idea is to use the LASSO to +post-process the predictions from the individual base learners in an ensemble +(i.e., decision trees) in the hopes of producing a much smaller model without +sacrificing much in the way of accuracy, and in some cases, improving it. +Friedman and Popescu (2003) describe conditions under which tree-based +ensembles, like random forest, can potentially benefit from such +post-processing (e.g., using shallower trees trained on much smaller samples +of the training data without replacement). 
However, the computational +benefits of such post-processing can only be realized if the base learners +"zeroed out" by the LASSO can actually be removed from the original ensemble, +hence the purpose of this function. A complete example using +\code{\link{ranger}} can be found at +\url{https://github.com/imbs-hl/ranger/issues/568}. +} +\note{ +This function is a generic and can be extended by other packages. +} +\examples{ +## Example of deforesting a random forest +rfo <- ranger(Species ~ ., data = iris, probability = TRUE, num.trees = 100) +dfo <- deforest(rfo, which.trees = c(1, 3, 5)) +dfo # same as `rfo` but with trees 1, 3, and 5 removed + +## Sanity check +preds.rfo <- predict(rfo, data = iris, predict.all = TRUE)$predictions +preds.dfo <- predict(dfo, data = iris, predict.all = TRUE)$predictions +identical(preds.rfo[, , -c(1, 3, 5)], y = preds.dfo) +} +\references{ +Friedman, J. and Popescu, B. (2003). Importance sampled learning ensembles, +Technical report, Stanford University, Department of Statistics. +\url{https://statweb.stanford.edu/~jhf/ftp/isle.pdf}. +} +\author{ +Brandon M. Greenwell +} diff --git a/man/print.deforest.ranger.Rd b/man/print.deforest.ranger.Rd new file mode 100644 index 000000000..044312df9 --- /dev/null +++ b/man/print.deforest.ranger.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deforest.R +\name{print.deforest.ranger} +\alias{print.deforest.ranger} +\title{Print deforested ranger summary} +\usage{ +\method{print}{deforest.ranger}(x, ...) +} +\arguments{ +\item{x}{A \code{\link{deforest}} object (i.e., an object that inherits from +class \code{"deforest.ranger"}).} + +\item{...}{Further arguments passed to or from other methods.} +} +\description{ +Print basic information about a deforested \code{\link{ranger}} object. 
+} +\note{ +Many of the components of a typical \code{\link{ranger}} object are not +available after deforestation and are instead replaced with \code{NA} (e.g., +out-of-bag (OOB) predictions, variable importance scores (if requested), and +OOB-based error metrics). +} +\seealso{ +\code{\link{deforest}}. +} +\author{ +Brandon M. Greenwell +} diff --git a/tests/testthat/test_deforest.R b/tests/testthat/test_deforest.R new file mode 100644 index 000000000..16b4981ed --- /dev/null +++ b/tests/testthat/test_deforest.R @@ -0,0 +1,43 @@ +library(ranger) +library(survival) +context("ranger_deforest") + + +test_that("deforest works as expected for probability estimation", { + rfo <- ranger(Species ~ ., data = iris, num.trees = 10, probability = TRUE) + dfo <- deforest(rfo, which.trees = c(1, 3, 5), warn = FALSE) + pred.rfo <- predict(rfo, data = iris, predict.all = TRUE)$predictions + pred.dfo <- predict(dfo, data = iris, predict.all = TRUE)$predictions + expect_identical(pred.rfo[, , -c(1, 3, 5)], pred.dfo) +}) + +test_that("deforest works as expected for classification", { + rfo <- ranger(Species ~ ., data = iris, num.trees = 10) + dfo <- deforest(rfo, which.trees = c(1, 3, 5), warn = FALSE) + pred.rfo <- predict(rfo, data = iris, predict.all = TRUE)$predictions + pred.dfo <- predict(dfo, data = iris, predict.all = TRUE)$predictions + expect_identical(pred.rfo[, -c(1, 3, 5)], pred.dfo) +}) + +test_that("deforest works as expected for regression", { + n <- 50 + x <- runif(n, min = 0, max = 2*pi) + dat <- data.frame(x = x, y = sin(x) + rnorm(n, sd = 0.1)) + rfo <- ranger(y ~ ., data = dat, num.trees = 10) + dfo <- deforest(rfo, which.trees = c(1, 3, 5), warn = FALSE) + pred.rfo <- predict(rfo, data = dat, predict.all = TRUE)$predictions + pred.dfo <- predict(dfo, data = dat, predict.all = TRUE)$predictions + expect_identical(pred.rfo[, -c(1, 3, 5)], pred.dfo) +}) + +test_that("deforest works as expected for censored outcomes", { + dat <- data.frame(time = runif(100, 1, 
10), status = rbinom(100, 1, .5), + x = rbinom(100, 1, .5)) + rfo <- ranger(Surv(time, status) ~ x, data = dat, num.trees = 10, + splitrule = "logrank") + dfo <- deforest(rfo, which.trees = c(1, 3, 5), warn = FALSE) + pred.rfo <- predict(rfo, data = dat, predict.all = TRUE) + pred.dfo <- predict(dfo, data = dat, predict.all = TRUE) + expect_identical(pred.rfo$chf[, , -c(1, 3, 5)], pred.dfo$chf) + expect_identical(pred.rfo$survival[, , -c(1, 3, 5)], pred.dfo$survival) +}) From 05c088999bd38befbc034cf2f9be480134112a8c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 17 Oct 2021 00:45:33 -0700 Subject: [PATCH 030/111] Use .find(','), not .find(","), for efficiency See https://clang.llvm.org/extra/clang-tidy/checks/performance-faster-string-find.html --- src/Data.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data.cpp b/src/Data.cpp index accf27a23..c3f0a7161 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -68,9 +68,9 @@ bool Data::loadFromFile(std::string filename, std::vector& dependen getline(input_file, header_line); // Find out if comma, semicolon or whitespace seperated and call appropriate method - if (header_line.find(",") != std::string::npos) { + if (header_line.find(',') != std::string::npos) { result = loadFromFileOther(input_file, header_line, dependent_variable_names, ','); - } else if (header_line.find(";") != std::string::npos) { + } else if (header_line.find(';') != std::string::npos) { result = loadFromFileOther(input_file, header_line, dependent_variable_names, ';'); } else { result = loadFromFileWhitespace(input_file, header_line, dependent_variable_names); From f75e1674506bf2cbeefed4c72cab941c984f8843 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 11 Jan 2022 14:12:30 +0100 Subject: [PATCH 031/111] cross compile with posix threads --- cpp_version/cross_compile/toolchain_win32.cmake | 4 ++-- cpp_version/cross_compile/toolchain_win64.cmake | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff 
--git a/cpp_version/cross_compile/toolchain_win32.cmake b/cpp_version/cross_compile/toolchain_win32.cmake index 6c0bdd753..c07342fd3 100644 --- a/cpp_version/cross_compile/toolchain_win32.cmake +++ b/cpp_version/cross_compile/toolchain_win32.cmake @@ -3,7 +3,7 @@ SET(CMAKE_SYSTEM_NAME Windows) # which compilers to use for C and C++ SET(CMAKE_C_COMPILER i686-w64-mingw32-gcc) -SET(CMAKE_CXX_COMPILER i686-w64-mingw32-g++) +SET(CMAKE_CXX_COMPILER i686-w64-mingw32-g++-posix) SET(CMAKE_RC_COMPILER i686-w64-mingw32-windres) # here is the target environment located @@ -14,4 +14,4 @@ SET(CMAKE_FIND_ROOT_PATH /usr/i686-w64-mingw32) # programs in the host environment set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) \ No newline at end of file +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/cpp_version/cross_compile/toolchain_win64.cmake b/cpp_version/cross_compile/toolchain_win64.cmake index 854e225c5..535e5b716 100644 --- a/cpp_version/cross_compile/toolchain_win64.cmake +++ b/cpp_version/cross_compile/toolchain_win64.cmake @@ -3,7 +3,7 @@ SET(CMAKE_SYSTEM_NAME Windows) # which compilers to use for C and C++ SET(CMAKE_C_COMPILER x86_64-w64-mingw32-gcc) -SET(CMAKE_CXX_COMPILER x86_64-w64-mingw32-g++) +SET(CMAKE_CXX_COMPILER x86_64-w64-mingw32-g++-posix) SET(CMAKE_RC_COMPILER x86_64-w64-mingw32-windres) # here is the target environment located @@ -14,4 +14,4 @@ SET(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32) # programs in the host environment set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) \ No newline at end of file +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) From b79829f1642fb5901783cd21cbb44bce11b7a766 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 3 Mar 2022 08:44:18 +0100 Subject: [PATCH 032/111] update version and news file with deforest function --- DESCRIPTION | 4 ++-- NEWS.md | 
7 +++++++ cpp_version/src/version.h | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6091b2904..794502318 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.13.1 -Date: 2021-07-14 +Version: 0.13.2 +Date: 2022-03-03 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 880e3d868..80b45f249 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,11 @@ +# ranger 0.13.2 +* Add deforest() function to remove trees from ensemble +* Fix cross compiling for Windows + +# ranger 0.13.1 +* Fix https URLs + # ranger 0.13.0 * New CRAN version diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 32040faf3..ab3f8899c 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.13.1" +#define RANGER_VERSION "0.13.2" #endif From 4b66555304160e991fdd29e58438cdafc2fff801 Mon Sep 17 00:00:00 2001 From: RomanHornung Date: Fri, 20 May 2022 16:47:34 +0200 Subject: [PATCH 033/111] Small change that allows for much faster permutation variable importance calculation for high-dimensional data --- src/Tree.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/Tree.cpp b/src/Tree.cpp index 4305ab15f..0aa9a773a 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -227,6 +227,34 @@ void Tree::computePermutationImportance(std::vector& forest_importance, // Randomly permute for all independent variables for (size_t i = 0; i < num_independent_variables; ++i) { + // Check whether the i-th variable is used in the + // tree: + bool isused = false; + for (size_t j = 0; j < split_varIDs.size(); ++j) + { + if (split_varIDs[j] == i) + { + isused = true; + break; + } + } + + 
// Only if the variable is used in the tree, the OOB predictions + // can possibly change by permuting the OOB observations. + // Therefore, we only need to permute the OOB observations and + // re-calculate the predictions, if the variable is used in the tree. + // Otherwise 'accuracy_normal' and 'accuracy_permuted' would + // be the same, which is why their difference would be zero + // and we would correspondlgy add nothing (zero) to the sum 'forest_importance[i]' + // of the differences between the accuracies 'accuracy_normal' and + // 'accuracy_permuted'. + // Therefore, the following part is only performed if the variable + // is used in the tree (this condition makes the computations much + // less expensive, in particular for high-dimensional data because + // here most variables will not be used in most trees): + if (isused) + { + // Permute and compute prediction accuracy again for this permutation and save difference permuteAndPredictOobSamples(i, permutations); double accuracy_permuted; @@ -249,6 +277,9 @@ void Tree::computePermutationImportance(std::vector& forest_importance, } else if (importance_mode == IMP_PERM_LIAW) { forest_variance[i] += accuracy_difference * accuracy_difference * num_samples_oob; } + + } + } } From e7dddd8c1d168ec8d32c834608688c10e574c959 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 25 May 2022 09:15:18 +0200 Subject: [PATCH 034/111] formatting and indentation --- src/Tree.cpp | 82 ++++++++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 50 deletions(-) diff --git a/src/Tree.cpp b/src/Tree.cpp index 0aa9a773a..af9231f68 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -227,59 +227,41 @@ void Tree::computePermutationImportance(std::vector& forest_importance, // Randomly permute for all independent variables for (size_t i = 0; i < num_independent_variables; ++i) { - // Check whether the i-th variable is used in the - // tree: - bool isused = false; - for (size_t j = 0; j < 
split_varIDs.size(); ++j) - { - if (split_varIDs[j] == i) - { - isused = true; - break; - } - } - - // Only if the variable is used in the tree, the OOB predictions - // can possibly change by permuting the OOB observations. - // Therefore, we only need to permute the OOB observations and - // re-calculate the predictions, if the variable is used in the tree. - // Otherwise 'accuracy_normal' and 'accuracy_permuted' would - // be the same, which is why their difference would be zero - // and we would correspondlgy add nothing (zero) to the sum 'forest_importance[i]' - // of the differences between the accuracies 'accuracy_normal' and - // 'accuracy_permuted'. - // Therefore, the following part is only performed if the variable - // is used in the tree (this condition makes the computations much - // less expensive, in particular for high-dimensional data because - // here most variables will not be used in most trees): - if (isused) - { - - // Permute and compute prediction accuracy again for this permutation and save difference - permuteAndPredictOobSamples(i, permutations); - double accuracy_permuted; - if (importance_mode == IMP_PERM_CASEWISE) { - accuracy_permuted = computePredictionAccuracyInternal(&prederr_shuf_casewise); - for (size_t j = 0; j < num_samples_oob; ++j) { - size_t pos = i * num_samples + oob_sampleIDs[j]; - forest_importance_casewise[pos] += prederr_shuf_casewise[j] - prederr_normal_casewise[j]; + // Check whether the i-th variable is used in the + // tree: + bool isused = false; + for (size_t j = 0; j < split_varIDs.size(); ++j) { + if (split_varIDs[j] == i) { + isused = true; + break; } - } else { - accuracy_permuted = computePredictionAccuracyInternal(NULL); } - - double accuracy_difference = accuracy_normal - accuracy_permuted; - forest_importance[i] += accuracy_difference; - - // Compute variance - if (importance_mode == IMP_PERM_BREIMAN) { - forest_variance[i] += accuracy_difference * accuracy_difference; - } else if (importance_mode == 
IMP_PERM_LIAW) { - forest_variance[i] += accuracy_difference * accuracy_difference * num_samples_oob; + + // Only do permutations if the variable is used in the tree, otherwise variable importance is 0 + if (isused) { + // Permute and compute prediction accuracy again for this permutation and save difference + permuteAndPredictOobSamples(i, permutations); + double accuracy_permuted; + if (importance_mode == IMP_PERM_CASEWISE) { + accuracy_permuted = computePredictionAccuracyInternal(&prederr_shuf_casewise); + for (size_t j = 0; j < num_samples_oob; ++j) { + size_t pos = i * num_samples + oob_sampleIDs[j]; + forest_importance_casewise[pos] += prederr_shuf_casewise[j] - prederr_normal_casewise[j]; + } + } else { + accuracy_permuted = computePredictionAccuracyInternal(NULL); + } + + double accuracy_difference = accuracy_normal - accuracy_permuted; + forest_importance[i] += accuracy_difference; + + // Compute variance + if (importance_mode == IMP_PERM_BREIMAN) { + forest_variance[i] += accuracy_difference * accuracy_difference; + } else if (importance_mode == IMP_PERM_LIAW) { + forest_variance[i] += accuracy_difference * accuracy_difference * num_samples_oob; + } } - - } - } } From a3855daa89e9775538ad742f02fad9e659f14baa Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 25 May 2022 09:20:33 +0200 Subject: [PATCH 035/111] new version --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ cpp_version/src/version.h | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 794502318..c87d96629 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.13.2 -Date: 2022-03-03 +Version: 0.13.3 +Date: 2022-05-25 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 80b45f249..e2c8d8cf5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.13.3 +* Faster permutation variable importance for high dimensional data (thanks to Roman Hornung) + # ranger 0.13.2 * Add deforest() function to remove trees from ensemble * Fix cross compiling for Windows diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index ab3f8899c..2ba33290d 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.13.2" +#define RANGER_VERSION "0.13.3" #endif From dd1d02f3d0cb4e052abcff8f0abf7d83b51a6012 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 9 Jun 2022 11:27:24 +0200 Subject: [PATCH 036/111] allow split.select.weights and always.split.variables together, issue #618 --- DESCRIPTION | 7 +++---- NEWS.md | 3 +++ R/ranger.R | 7 ++----- man/ranger.Rd | 3 ++- src/Forest.cpp | 8 +++++++- src/RcppExports.cpp | 5 +++++ 6 files changed, 22 insertions(+), 11 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c87d96629..c7a0c2e3f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.13.3 -Date: 2022-05-25 +Version: 0.13.4 +Date: 2022-06-09 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -16,10 +16,9 @@ Imports: Rcpp (>= 0.11.2), Matrix LinkingTo: Rcpp, RcppEigen Depends: R (>= 3.1) Suggests: - covr, survival, testthat Encoding: UTF-8 -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.0 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/NEWS.md b/NEWS.md index e2c8d8cf5..367d4e249 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.13.4 +* Allow split.select.weights and always.split.variables together + # ranger 0.13.3 * Faster permutation variable importance for high dimensional data (thanks to Roman Hornung) diff --git a/R/ranger.R b/R/ranger.R index bebe3ea21..81f408eee 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -50,6 +50,7 @@ ##' In \code{split.select.weights}, weights do not need to sum up to 1, they will be normalized later. ##' The weights are assigned to the variables in the order they appear in the formula or in the data if no formula is used. ##' Names of the \code{split.select.weights} vector are ignored. +##' Weights assigned by \code{split.select.weights} to variables in \code{always.split.variables} are ignored. ##' The usage of \code{split.select.weights} can increase the computation times for large forests. ##' ##' Unordered factor covariates can be handled in 3 different ways by using \code{respect.unordered.factors}: @@ -102,7 +103,7 @@ ##' @param num.random.splits For "extratrees" splitrule.: Number of random splits to consider for each candidate splitting variable. ##' @param alpha For "maxstat" splitrule: Significance threshold to allow splitting. ##' @param minprop For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting. -##' @param split.select.weights Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. 
Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used. +##' @param split.select.weights Numeric vector with weights between 0 and 1, used to calculate the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used. ##' @param always.split.variables Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting. ##' @param respect.unordered.factors Handling of unordered factor covariates. One of 'ignore', 'order' and 'partition'. For the "extratrees" splitrule the default is "partition" for all other splitrules 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used. See below for details. ##' @param scale.permutation.importance Scale permutation importance by standard error as in (Breiman 2001). Only applicable if permutation variable importance mode selected. @@ -668,10 +669,6 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, use.always.split.variables <- TRUE } - if (use.split.select.weights && use.always.split.variables) { - stop("Error: Please use only one option of split.select.weights and always.split.variables.") - } - ## Splitting rule if (is.null(splitrule)) { if (treetype == 5) { diff --git a/man/ranger.Rd b/man/ranger.Rd index 410cd7fe5..02cecfdab 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -81,7 +81,7 @@ ranger( \item{minprop}{For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting.} -\item{split.select.weights}{Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. 
Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.} +\item{split.select.weights}{Numeric vector with weights between 0 and 1, used to calculate the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.} \item{always.split.variables}{Character vector with variable names to be always selected in addition to the \code{mtry} variables tried for splitting.} @@ -173,6 +173,7 @@ Variables selected with \code{always.split.variables} are tried additionally to In \code{split.select.weights}, weights do not need to sum up to 1, they will be normalized later. The weights are assigned to the variables in the order they appear in the formula or in the data if no formula is used. Names of the \code{split.select.weights} vector are ignored. +Weights assigned by \code{split.select.weights} to variables in \code{always.split.variables} are ignored. The usage of \code{split.select.weights} can increase the computation times for large forests. 
Unordered factor covariates can be handled in 3 different ways by using \code{respect.unordered.factors}: diff --git a/src/Forest.cpp b/src/Forest.cpp index f914d44ee..3a90a3116 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -964,6 +964,12 @@ void Forest::setSplitWeightVector(std::vector>& split_select this->split_select_weights.clear(); this->split_select_weights.resize(num_trees, std::vector(num_weights)); } + + // Deterministic varIDs + std::vector is_deterministic(num_weights, false); + for (size_t i = 0; i < deterministic_varIDs.size(); ++i) { + is_deterministic[i] = true; + } // Split up in deterministic and weighted variables, ignore zero weights for (size_t i = 0; i < split_select_weights.size(); ++i) { @@ -977,7 +983,7 @@ void Forest::setSplitWeightVector(std::vector>& split_select for (size_t j = 0; j < split_select_weights[i].size(); ++j) { double weight = split_select_weights[i][j]; - if (weight == 0) { + if (weight == 0 || is_deterministic[j]) { ++num_zero_weights; } else if (weight < 0 || weight > 1) { throw std::runtime_error("One or more split select weights not in range [0,1]."); diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 499d16890..931661a93 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -7,6 +7,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // rangerCpp Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, 
std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) { From a46fb4722d52482161b1e8eb17ec2ae54232ab79 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 9 Jun 2022 13:11:02 +0200 Subject: [PATCH 037/111] update GH actions workflow to new version --- .github/workflows/R-CMD-check-cran.yaml | 78 
------------------------- .github/workflows/R-CMD-check.yaml | 76 ++++++------------------ 2 files changed, 19 insertions(+), 135 deletions(-) delete mode 100644 .github/workflows/R-CMD-check-cran.yaml diff --git a/.github/workflows/R-CMD-check-cran.yaml b/.github/workflows/R-CMD-check-cran.yaml deleted file mode 100644 index dc1a86b9b..000000000 --- a/.github/workflows/R-CMD-check-cran.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. -# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions -on: workflow_dispatch - -name: R-CMD-check-cran - -jobs: - R-CMD-check: - runs-on: ${{ matrix.config.os }} - - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - - strategy: - fail-fast: false - matrix: - config: - - {os: windows-latest, r: 'devel'} - - {os: macOS-latest, r: 'devel'} - - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - - steps: - - uses: actions/checkout@v2 - - - uses: r-lib/actions/setup-r@v1 - with: - r-version: ${{ matrix.config.r }} - - - uses: r-lib/actions/setup-pandoc@v1 - - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Restore R package cache - if: runner.os != 'Windows' - uses: actions/cache@v2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - - - name: Install system dependencies - if: runner.os == 'Linux' - run: | - while read -r 
cmd - do - eval sudo $cmd - done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - - - name: Install dependencies - run: | - remotes::install_deps(dependencies = TRUE) - remotes::install_cran("rcmdcheck") - shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_: true - _R_CHECK_CRAN_INCOMING_REMOTE_: true - run: | - options(crayon.enabled = TRUE) - rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") - shell: Rscript {0} - - - name: Upload check results - if: failure() - uses: actions/upload-artifact@main - with: - name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 8bcc15c48..2ceeced34 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,14 +1,10 @@ -# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. -# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: - - main - - master + branches: [main, master] pull_request: - branches: - - main - - master + branches: [main, master] name: R-CMD-check @@ -22,66 +18,32 @@ jobs: fail-fast: false matrix: config: + - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: macOS-latest, r: 'release'} - - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true - - uses: r-lib/actions/setup-pandoc@v1 - - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Restore R package cache - if: runner.os != 'Windows' - uses: actions/cache@v2 + - uses: r-lib/actions/setup-r-dependencies@v2 with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - - - name: Install system dependencies - if: runner.os 
== 'Linux' - run: | - while read -r cmd - do - eval sudo $cmd - done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - - - name: Install dependencies - run: | - remotes::install_deps(dependencies = TRUE) - remotes::install_cran("rcmdcheck") - shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_REMOTE_: false - run: | - options(crayon.enabled = TRUE) - rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") - shell: Rscript {0} + extra-packages: any::rcmdcheck + needs: check - - name: Upload check results - if: failure() - uses: actions/upload-artifact@main + - uses: r-lib/actions/check-r-package@v2 with: - name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check + upload-snapshots: true \ No newline at end of file From 332cbcf5cd91096b0573b904c2696b25d4fca4f3 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 Jun 2022 10:14:47 +0200 Subject: [PATCH 038/111] fix csrf() weight calculation, fixes #615 --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ R/csrf.R | 2 +- cpp_version/src/version.h | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c7a0c2e3f..422310a5f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.13.4 -Date: 2022-06-09 +Version: 0.13.5 +Date: 2022-06-16 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 367d4e249..f524b2d88 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.13.5 +* Fix weight calculation in case-specific RF (csrf()) + # ranger 0.13.4 * Allow split.select.weights and always.split.variables together diff --git a/R/csrf.R b/R/csrf.R index ad7c1a1a6..f1e1b3f2a 100644 --- a/R/csrf.R +++ b/R/csrf.R @@ -83,7 +83,7 @@ csrf <- function(formula, training_data, test_data, params1 = list(), params2 = } ## Compute weights from first RF - num.same.node <- rowSums(terminal.nodeIDs.test[i, ] == terminal.nodeIDs.train) + num.same.node <- colSums(terminal.nodeIDs.test[i, ] == t(terminal.nodeIDs.train)) weights <- num.same.node / sum(num.same.node) ## Grow weighted RF diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 2ba33290d..f226e8f2d 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.13.3" +#define RANGER_VERSION "0.13.5" #endif From e38a2271dee1767d43ff4ee8c959c73ff519ee1a Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 Jun 2022 10:57:20 +0200 Subject: [PATCH 039/111] add as.data.frame() method for predictions, #607 --- NAMESPACE | 1 + NEWS.md | 1 + R/predictions.R | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index b27cb67bd..3aad2ad21 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(as.data.frame,ranger.prediction) S3method(deforest,ranger) S3method(importance,ranger) S3method(predict,ranger) diff --git a/NEWS.md b/NEWS.md index f524b2d88..205249777 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # ranger 0.13.5 * Fix weight calculation in case-specific RF (csrf()) +* Add as.data.frame() method for predictions # ranger 0.13.4 * Allow split.select.weights and 
always.split.variables together diff --git a/R/predictions.R b/R/predictions.R index 9c9463410..ba7595ea2 100644 --- a/R/predictions.R +++ b/R/predictions.R @@ -71,7 +71,7 @@ predictions.ranger.prediction <- function(x, ...) { ##' @seealso \code{\link{ranger}} ##' @author Marvin N. Wright ##' @export -predictions.ranger<- function(x, ...) { +predictions.ranger <- function(x, ...) { if (!inherits(x, "ranger")) { stop("Object ist no ranger object.") } @@ -91,3 +91,21 @@ predictions.ranger<- function(x, ...) { stop("Unknown tree type.") } } + +##' @export +as.data.frame.ranger.prediction <- function(x, ...) { + if (x$treetype == "Survival") { + df <- data.frame(x$survival) + colnames(df) <- paste0("time=", x$unique.death.times) + } else if (x$treetype == "Probability estimation") { + df <- data.frame(x$predictions) + } else { + df <- data.frame(prediction = x$predictions) + } + + if (!is.null(x$se)) { + df$se <- x$se + } + + df +} From d8a9d4b636ffbb2a0105320e5f8d2bda01f882e1 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 Jun 2022 13:39:51 +0200 Subject: [PATCH 040/111] prepare new cran release --- DESCRIPTION | 2 +- NEWS | 10 +++++++++- NEWS.md | 3 +++ cpp_version/src/version.h | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 422310a5f..3cfdf6631 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.13.5 +Version: 0.14.0 Date: 2022-06-16 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright diff --git a/NEWS b/NEWS index 747a902eb..503005ffb 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,13 @@ -##### Version 0.13 +##### Version 0.14.0 +* Faster permutation variable importance for high dimensional data (thanks to Roman Hornung) +* Add deforest() function to remove trees from ensemble +* Allow split.select.weights and always.split.variables together +* Add as.data.frame() method for predictions +* Fix weight calculation in case-specific RF (csrf()) +* Fix cross compiling for Windows + +##### Version 0.13.0 * Faster quantile prediction * Add ... argument to ranger() * Bug fixes diff --git a/NEWS.md b/NEWS.md index 205249777..4a9c3f5ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.14.0 +* New CRAN version + # ranger 0.13.5 * Fix weight calculation in case-specific RF (csrf()) * Add as.data.frame() method for predictions diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index f226e8f2d..d3b3f9b5e 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.13.5" +#define RANGER_VERSION "0.14.0" #endif From bccb98cad99afa6daa25935067e2b621f217f56d Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 17 Jun 2022 09:04:58 +0200 Subject: [PATCH 041/111] URL etc fixes for CRAN --- .Rbuildignore | 4 ++++ .gitignore | 3 +++ DESCRIPTION | 4 ++-- NEWS.md | 3 +++ R/deforest.R | 2 +- cpp_version/src/version.h | 2 +- man/deforest.Rd | 2 +- src/Forest.cpp | 2 +- 8 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 7648e208a..8d8c64dce 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,7 @@ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ +^ranger\.Rcheck$ +^ranger.*\.tar\.gz$ +^ranger.*\.tgz$ +revdep/* diff --git a/.gitignore b/.gitignore index 96f14cc59..47ec7a386 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ ranger.Rcheck/ .project .cproject docs +ranger*.tar.gz +ranger*.tgz +revdep/* \ No newline at end of 
file diff --git a/DESCRIPTION b/DESCRIPTION index 3cfdf6631..0d7062f58 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.14.0 -Date: 2022-06-16 +Version: 0.14.1 +Date: 2022-06-17 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 4a9c3f5ab..759ccedf5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.14.1 +* URL fix for CRAN + # ranger 0.14.0 * New CRAN version diff --git a/R/deforest.R b/R/deforest.R index dbe4bef66..f81afc223 100644 --- a/R/deforest.R +++ b/R/deforest.R @@ -37,7 +37,7 @@ #' @references #' Friedman, J. and Popescu, B. (2003). Importance sampled learning ensembles, #' Technical report, Stanford University, Department of Statistics. -#' \url{https://statweb.stanford.edu/~jhf/ftp/isle.pdf}. +#' \url{https://jerryfriedman.su.domains/ftp/isle.pdf}. #' #' @rdname deforest #' diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index d3b3f9b5e..1d643dcd8 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.14.0" +#define RANGER_VERSION "0.14.1" #endif diff --git a/man/deforest.Rd b/man/deforest.Rd index 2e583a2cf..a8a4c85b6 100644 --- a/man/deforest.Rd +++ b/man/deforest.Rd @@ -61,7 +61,7 @@ identical(preds.rfo[, , -c(1, 3, 5)], y = preds.dfo) \references{ Friedman, J. and Popescu, B. (2003). Importance sampled learning ensembles, Technical report, Stanford University, Department of Statistics. -\url{https://statweb.stanford.edu/~jhf/ftp/isle.pdf}. +\url{https://jerryfriedman.su.domains/ftp/isle.pdf}. } \author{ Brandon M. 
Greenwell diff --git a/src/Forest.cpp b/src/Forest.cpp index 3a90a3116..7d46a64c1 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -343,7 +343,7 @@ void Forest::writeOutput() { *verbose_out << std::endl; } - if (!split_select_weights.empty() & !split_select_weights[0].empty()) { + if (!split_select_weights.empty() && !split_select_weights[0].empty()) { if (verbose_out) { *verbose_out << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights." From d8522254bf421b66030295dace81d1928232d28b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 20 Jul 2022 11:55:47 +0200 Subject: [PATCH 042/111] add test for #627 --- tests/testthat/test_char.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/testthat/test_char.R b/tests/testthat/test_char.R index 883e06e03..090126517 100644 --- a/tests/testthat/test_char.R +++ b/tests/testthat/test_char.R @@ -19,3 +19,13 @@ test_that("no warning or error if character vector in data, alternative interfac num.trees = 5, write.forest = TRUE)) expect_silent(predict(rf, dat)) }) + +test_that("same result for single row", { + rf <- ranger(Species ~ ., dat, num.trees = 5, probability = TRUE) + pred1 <- predict(rf, dat)$predictions + pred2 <- t(sapply(1:nrow(dat), function(i) { + predict(rf, dat[i, ])$predictions + })) + colnames(pred2) <- colnames(pred1) + expect_equal(pred1, pred2) +}) From fee0749b78a7fa71f6f7464d28e87688a98366ec Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 20 Jul 2022 12:33:32 +0200 Subject: [PATCH 043/111] fix old unordered test --- tests/testthat/test_unordered.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_unordered.R b/tests/testthat/test_unordered.R index b97dfca3f..f14656d1f 100644 --- a/tests/testthat/test_unordered.R +++ b/tests/testthat/test_unordered.R @@ -13,7 +13,7 @@ test_that("Old parameters still work", { rf.true <- ranger(y ~ ., data = dt, num.trees = 5, write.forest = 
TRUE, respect.unordered.factors = TRUE) - expect_null(rf.false$forest$covariate.levels) + expect_equal(rf.false$forest$covariate.levels$x, levels(factor(dt$x))) expect_equal(length(rf.true$forest$covariate.levels), 1) }) From 6b7febd86bf37d528d630b35ce29e96b53cff4f4 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 20 Jul 2022 13:49:58 +0200 Subject: [PATCH 044/111] save factor levels also for character features --- R/ranger.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index 81f408eee..5f79081d6 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -354,6 +354,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } ## Recode characters as factors and recode factors if 'order' mode + covariate.levels <- NULL if (!is.matrix(x) && !inherits(x, "Matrix") && ncol(x) > 0) { character.idx <- sapply(x, is.character) @@ -408,13 +409,15 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Return reordered factor factor(xx, levels = levels.ordered, ordered = TRUE, exclude = NULL) }) - - ## Save levels - covariate.levels <- lapply(x, levels) } else { ## Recode characters only x[character.idx] <- lapply(x[character.idx], factor) } + + ## Save levels + if (any(sapply(x, is.factor))) { + covariate.levels <- lapply(x, levels) + } } ## If gwa mode, add snp variable names @@ -957,8 +960,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, result$forest$treetype <- result$treetype class(result$forest) <- "ranger.forest" - ## In 'ordered' mode, save covariate levels - if (respect.unordered.factors == "order" && ncol(x) > 0) { + ## Save covariate levels + if (!is.null(covariate.levels)) { result$forest$covariate.levels <- covariate.levels } } From 6e3385043da74e1d75cc0d4d595d558441224c0e Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 20 Jul 2022 16:48:05 +0200 Subject: [PATCH 045/111] test and fix #626 --- R/treeInfo.R | 6 +-- 
tests/testthat/test_treeInfo.R | 84 ++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/R/treeInfo.R b/R/treeInfo.R index a27682608..467b7eb7d 100644 --- a/R/treeInfo.R +++ b/R/treeInfo.R @@ -120,16 +120,16 @@ treeInfo <- function(object, tree = 1) { result$prediction <- forest$split.values[[tree]] result$prediction[!result$terminal] <- NA if (!is.null(forest$levels)) { - result$prediction <- factor(result$prediction, levels = forest$class.values, labels = forest$levels) + result$prediction <- integer.to.factor(result$prediction, labels = forest$levels) } } else if (forest$treetype == "Regression") { result$prediction <- forest$split.values[[tree]] result$prediction[!result$terminal] <- NA } else if (forest$treetype == "Probability estimation") { - predictions <- matrix(nrow = nrow(result), ncol = length(forest$levels)) + predictions <- matrix(nrow = nrow(result), ncol = length(forest$class.values)) predictions[result$terminal, ] <- do.call(rbind, forest$terminal.class.counts[[tree]]) colnames(predictions) <- forest$levels[forest$class.values] - predictions <- predictions[, forest$levels, drop = FALSE] + predictions <- predictions[, forest$levels[sort(forest$class.values)], drop = FALSE] colnames(predictions) <- paste0("pred.", colnames(predictions)) result <- data.frame(result, predictions) } else if (forest$treetype == "Survival") { diff --git a/tests/testthat/test_treeInfo.R b/tests/testthat/test_treeInfo.R index 02b7d0a74..449db853d 100644 --- a/tests/testthat/test_treeInfo.R +++ b/tests/testthat/test_treeInfo.R @@ -51,6 +51,90 @@ test_that("Prediction for classification is factor with correct levels", { expect_equal(levels(ti.class.formula$prediction), levels(iris$Species)) }) +test_that("Prediction for classification is same as class prediction", { + dat <- iris[sample(nrow(iris)), ] + rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 1, + replace = FALSE, sample.fraction = 1) + pred_class 
<- predict(rf, dat)$predictions + nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + ti <- treeInfo(rf, 1) + pred_ti <- sapply(nodes, function(x) { + ti[ti$nodeID == x, "prediction"] + }) + expect_equal(pred_ti, pred_class) +}) + +test_that("Prediction for classification is same as class prediction, new factor", { + dat <- iris[sample(nrow(iris)), ] + dat$Species <- factor(dat$Species, levels = sample(levels(dat$Species))) + rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 1, + replace = FALSE, sample.fraction = 1) + pred_class <- predict(rf, dat)$predictions + nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + ti <- treeInfo(rf, 1) + pred_ti <- sapply(nodes, function(x) { + ti[ti$nodeID == x, "prediction"] + }) + expect_equal(pred_ti, pred_class) +}) + +test_that("Prediction for classification is same as class prediction, unused factor levels", { + dat <- iris[c(101:150, 51:100), ] + expect_warning(rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 1, + replace = FALSE, sample.fraction = 1)) + pred_class <- predict(rf, dat)$predictions + nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + ti <- treeInfo(rf, 1) + pred_ti <- sapply(nodes, function(x) { + ti[ti$nodeID == x, "prediction"] + }) + expect_equal(pred_ti, pred_class) +}) + +test_that("Prediction for probability is same as probability prediction", { + dat <- iris[sample(nrow(iris)), ] + rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 1, + sample.fraction = 1, replace = FALSE, probability = TRUE) + ti <- treeInfo(rf) + pred_prob <- predict(rf, dat)$predictions + nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + pred_ti <- t(sapply(nodes, function(x) { + as.matrix(ti[ti$nodeID == x, 8:10]) + })) + colnames(pred_ti) <- gsub("pred\\.", "", colnames(ti)[8:10]) + expect_equal(pred_prob, pred_ti) +}) + +test_that("Prediction for probability is same as probability 
prediction, new factor", { + dat <- iris[sample(nrow(iris)), ] + dat$Species <- factor(dat$Species, levels = sample(levels(dat$Species))) + rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 1, + sample.fraction = 1, replace = FALSE, probability = TRUE) + ti <- treeInfo(rf) + pred_prob <- predict(rf, dat)$predictions + nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + pred_ti <- t(sapply(nodes, function(x) { + as.matrix(ti[ti$nodeID == x, 8:10]) + })) + colnames(pred_ti) <- gsub("pred\\.", "", colnames(ti)[8:10]) + expect_equal(pred_prob, pred_ti) +}) + +test_that("Prediction for probability is same as probability prediction, unused factor levels", { + dat <- iris[c(101:150, 51:100), ] + dat$Species <- factor(dat$Species, levels = sample(levels(dat$Species))) + expect_warning(rf <- ranger(dependent.variable.name = "Species", data = dat, num.trees = 1, + sample.fraction = 1, replace = FALSE, probability = TRUE)) + ti <- treeInfo(rf) + pred_prob <- predict(rf, dat)$predictions + nodes <- predict(rf, dat, type = "terminalNodes")$predictions[, 1] + pred_ti <- t(sapply(nodes, function(x) { + as.matrix(ti[ti$nodeID == x, 8:9]) + })) + colnames(pred_ti) <- gsub("pred\\.", "", colnames(ti)[8:9]) + expect_equal(pred_prob, pred_ti) +}) + test_that("Prediction for matrix classification is integer with correct values", { rf <- ranger(dependent.variable.name = "Species", data = data.matrix(iris), num.trees = 5, classification = TRUE) From 5f6d82aef4c03d6450f0eed91db5e88d143ef9d2 Mon Sep 17 00:00:00 2001 From: "Marvin N. 
Wright" Date: Fri, 22 Jul 2022 09:13:54 +0200 Subject: [PATCH 046/111] add sponsors button --- .github/FUNDING.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 000000000..dc350c386 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +github: mnwright From 54050e50334deaa8fcc9d2498958f10cc2f22ae0 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 3 Nov 2022 09:02:41 +0100 Subject: [PATCH 047/111] add min.bucket parameter to restrict leaf size --- DESCRIPTION | 6 +-- NEWS | 3 ++ NEWS.md | 3 ++ R/RcppExports.R | 4 +- R/predict.R | 3 +- R/ranger.R | 22 ++++++--- cpp_version/src/version.h | 2 +- man/ranger.Rd | 9 ++-- src/Forest.cpp | 15 +++--- src/Forest.h | 10 ++-- src/ForestClassification.cpp | 5 ++ src/ForestProbability.cpp | 5 ++ src/ForestRegression.cpp | 5 ++ src/ForestSurvival.cpp | 5 ++ src/RcppExports.cpp | 9 ++-- src/Tree.cpp | 7 +-- src/Tree.h | 7 ++- src/TreeClassification.cpp | 87 +++++++++++++++++++++++----------- src/TreeProbability.cpp | 87 +++++++++++++++++++++++----------- src/TreeRegression.cpp | 92 ++++++++++++++++++++++++++---------- src/TreeSurvival.cpp | 24 +++++----- src/globals.h | 2 + src/rangerCpp.cpp | 4 +- tests/testthat/test_ranger.R | 34 +++++++++++++ 24 files changed, 321 insertions(+), 129 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0d7062f58..8641e7e20 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.14.1 -Date: 2022-06-17 +Version: 0.14.2 +Date: 2022-11-03 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -19,6 +19,6 @@ Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/NEWS b/NEWS index 503005ffb..d42d48b8a 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,7 @@ +##### Version 0.14.2 +* Add min.bucket parameter to restrict terminal node size + ##### Version 0.14.0 * Faster permutation variable importance for high dimensional data (thanks to Roman Hornung) * Add deforest() function to remove trees from ensemble diff --git a/NEWS.md b/NEWS.md index 759ccedf5..afe0f7571 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.14.2 +* Add min.bucket parameter to restrict terminal node size + # ranger 0.14.1 * URL fix for CRAN diff --git a/R/RcppExports.R b/R/RcppExports.R index 2c775d8b4..19cc8e8ac 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { - .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, 
importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) +rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { + .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, 
order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) } numSmaller <- function(values, reference) { diff --git a/R/predict.R b/R/predict.R index 1c06d8cfc..3d3948a8b 100644 --- a/R/predict.R +++ b/R/predict.R @@ -219,6 +219,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, mtry <- 0 importance <- 0 min.node.size <- 0 + min.bucket <- 0 split.select.weights <- list(c(0, 0)) use.split.select.weights <- FALSE always.split.variables <- c("0", "0") @@ -264,7 +265,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ## Call Ranger result <- rangerCpp(treetype, x, y, forest$independent.variable.names, mtry, num.trees, verbose, seed, num.threads, write.forest, importance, - min.node.size, split.select.weights, use.split.select.weights, + min.node.size, min.bucket, split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, prediction.mode, forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule, diff --git a/R/ranger.R b/R/ranger.R index 5f79081d6..b4d161226 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -44,8 +44,8 @@ ##' In contrast to other implementations, each tree returns a probability estimate and these estimates are averaged for the forest probability estimate. ##' For details see Malley et al. (2012). ##' -##' Note that for classification and regression nodes with size smaller than \code{min.node.size} can occur, as in original Random Forests. -##' For survival all nodes contain at \code{min.node.size} samples. +##' Note that nodes with size smaller than \code{min.node.size} can occur because \code{min.node.size} is the minimal node size \emph{to split at}, as in original Random Forests. +##' To restrict the size of terminal nodes, set \code{min.bucket}. 
##' Variables selected with \code{always.split.variables} are tried additionally to the mtry variables randomly selected. ##' In \code{split.select.weights}, weights do not need to sum up to 1, they will be normalized later. ##' The weights are assigned to the variables in the order they appear in the formula or in the data if no formula is used. @@ -93,7 +93,8 @@ ##' @param importance Variable importance mode, one of 'none', 'impurity', 'impurity_corrected', 'permutation'. The 'impurity' measure is the Gini index for classification, the variance of the responses for regression and the sum of test statistics (see \code{splitrule}) for survival. ##' @param write.forest Save \code{ranger.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended. ##' @param probability Grow a probability forest as in Malley et al. (2012). -##' @param min.node.size Minimal node size. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability. +##' @param min.node.size Minimal node size to split at. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability. +##' @param min.bucket Minimal terminal node size. No nodes smaller than this value can occur. Default 3 for survival and 1 for all other tree types. ##' @param max.depth Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree). ##' @param replace Sample with replacement. ##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values. 
@@ -211,8 +212,8 @@ ##' @export ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, importance = "none", write.forest = TRUE, probability = FALSE, - min.node.size = NULL, max.depth = NULL, replace = TRUE, - sample.fraction = ifelse(replace, 1, 0.632), + min.node.size = NULL, min.bucket = NULL, max.depth = NULL, + replace = TRUE, sample.fraction = ifelse(replace, 1, 0.632), case.weights = NULL, class.weights = NULL, splitrule = NULL, num.random.splits = 1, alpha = 0.5, minprop = 0.1, split.select.weights = NULL, always.split.variables = NULL, @@ -490,12 +491,19 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: Invalid value for num.threads") } - ## Minumum node size + ## Minimum node size if (is.null(min.node.size)) { min.node.size <- 0 } else if (!is.numeric(min.node.size) || min.node.size < 0) { stop("Error: Invalid value for min.node.size") } + + ## Minimum bucket size + if (is.null(min.bucket)) { + min.bucket <- 0 + } else if (!is.numeric(min.bucket) || min.bucket < 0) { + stop("Error: Invalid value for min.bucket") + } ## Tree depth if (is.null(max.depth)) { @@ -857,7 +865,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Call Ranger result <- rangerCpp(treetype, x, y.mat, independent.variable.names, mtry, num.trees, verbose, seed, num.threads, write.forest, importance.mode, - min.node.size, split.select.weights, use.split.select.weights, + min.node.size, min.bucket, split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, prediction.mode, loaded.forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 1d643dcd8..d59bf779c 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.14.1" +#define RANGER_VERSION "0.14.2" 
#endif diff --git a/man/ranger.Rd b/man/ranger.Rd index 02cecfdab..2e41c79e4 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -13,6 +13,7 @@ ranger( write.forest = TRUE, probability = FALSE, min.node.size = NULL, + min.bucket = NULL, max.depth = NULL, replace = TRUE, sample.fraction = ifelse(replace, 1, 0.632), @@ -61,7 +62,9 @@ ranger( \item{probability}{Grow a probability forest as in Malley et al. (2012).} -\item{min.node.size}{Minimal node size. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability.} +\item{min.node.size}{Minimal node size to split at. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability.} + +\item{min.bucket}{Minimal terminal node size. No nodes smaller than this value can occur. Default 3 for survival and 1 for all other tree types.} \item{max.depth}{Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree).} @@ -167,8 +170,8 @@ Predictions are class probabilities for each sample. In contrast to other implementations, each tree returns a probability estimate and these estimates are averaged for the forest probability estimate. For details see Malley et al. (2012). -Note that for classification and regression nodes with size smaller than \code{min.node.size} can occur, as in original Random Forests. -For survival all nodes contain at \code{min.node.size} samples. +Note that nodes with size smaller than \code{min.node.size} can occur because \code{min.node.size} is the minimal node size \emph{to split at}, as in original Random Forests. +To restrict the size of terminal nodes, set \code{min.bucket}. Variables selected with \code{always.split.variables} are tried additionally to the mtry variables randomly selected. In \code{split.select.weights}, weights do not need to sum up to 1, they will be normalized later. 
The weights are assigned to the variables in the order they appear in the formula or in the data if no formula is used. diff --git a/src/Forest.cpp b/src/Forest.cpp index 7d46a64c1..9f6c6c493 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -29,7 +29,7 @@ namespace ranger { Forest::Forest() : - verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), seed(0), num_samples( + verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), min_bucket(0), num_independent_variables(0), seed(0), num_samples( 0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting( false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout( false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( @@ -40,7 +40,7 @@ Forest::Forest() : // #nocov start void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, - std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, + std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, std::string split_select_weights_file, const std::vector& always_split_variable_names, std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, @@ -81,7 +81,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode // Call other init function init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, - min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, + min_node_size, min_bucket, prediction_mode, 
sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, false, max_depth, regularization_factor, regularization_usedepth); @@ -135,7 +135,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode // #nocov end void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, - uint num_threads, ImportanceMode importance_mode, uint min_node_size, + uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, @@ -147,7 +147,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, this->verbose_out = verbose_out; // Call other init function - init(std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, + init(std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, min_bucket, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth); @@ -180,7 +180,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, } void Forest::init(std::unique_ptr input_data, uint mtry, std::string output_prefix, - uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, bool prediction_mode, bool sample_with_replacement, const 
std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, @@ -215,6 +215,7 @@ void Forest::init(std::unique_ptr input_data, uint mtry, std::string outpu this->output_prefix = output_prefix; this->importance_mode = importance_mode; this->min_node_size = min_node_size; + this->min_bucket = min_bucket; this->prediction_mode = prediction_mode; this->sample_with_replacement = sample_with_replacement; this->memory_saving_splitting = memory_saving_splitting; @@ -477,7 +478,7 @@ void Forest::grow() { } trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, - importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, + importance_mode, min_node_size, min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth, ®ularization_factor, regularization_usedepth, &split_varIDs_used); } diff --git a/src/Forest.h b/src/Forest.h index a1275b43e..1f5cd6cbc 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -42,7 +42,7 @@ class Forest { // Init from c++ main or Rcpp from R void initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, - std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, + std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, std::string split_select_weights_file, const std::vector& always_split_variable_names, std::string status_variable_name, bool sample_with_replacement, const std::vector& unordered_variable_names, bool 
memory_saving_splitting, SplitRule splitrule, @@ -50,7 +50,7 @@ class Forest { bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, - uint num_threads, ImportanceMode importance_mode, uint min_node_size, + uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, @@ -59,7 +59,7 @@ class Forest { PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void init(std::unique_ptr input_data, uint mtry, std::string output_prefix, - uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, @@ -122,6 +122,9 @@ class Forest { uint getMinNodeSize() const { return min_node_size; } + uint getMinBucket() const { + return min_bucket; + } size_t getNumIndependentVariables() const { return num_independent_variables; } @@ -189,6 +192,7 @@ class Forest { size_t num_trees; uint mtry; uint min_node_size; + uint min_bucket; size_t num_independent_variables; uint seed; size_t num_samples; diff --git a/src/ForestClassification.cpp b/src/ForestClassification.cpp index 
18ec35fee..78b4df97d 100644 --- a/src/ForestClassification.cpp +++ b/src/ForestClassification.cpp @@ -58,6 +58,11 @@ void ForestClassification::initInternal() { min_node_size = DEFAULT_MIN_NODE_SIZE_CLASSIFICATION; } + // Set minimal bucket size + if (min_bucket == 0) { + min_bucket = DEFAULT_MIN_BUCKET; + } + // Create class_values and response_classIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { diff --git a/src/ForestProbability.cpp b/src/ForestProbability.cpp index 9a17bbad9..c03f30c8e 100644 --- a/src/ForestProbability.cpp +++ b/src/ForestProbability.cpp @@ -63,6 +63,11 @@ void ForestProbability::initInternal() { min_node_size = DEFAULT_MIN_NODE_SIZE_PROBABILITY; } + // Set minimal bucket size + if (min_bucket == 0) { + min_bucket = DEFAULT_MIN_BUCKET; + } + // Create class_values and response_classIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { diff --git a/src/ForestRegression.cpp b/src/ForestRegression.cpp index b721cb7ed..e6ad3774a 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -52,6 +52,11 @@ void ForestRegression::initInternal() { min_node_size = DEFAULT_MIN_NODE_SIZE_REGRESSION; } + // Set minimal bucket size + if (min_bucket == 0) { + min_bucket = DEFAULT_MIN_BUCKET; + } + // Error if beta splitrule used with data outside of [0,1] if (splitrule == BETA && !prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { diff --git a/src/ForestSurvival.cpp b/src/ForestSurvival.cpp index f5416c937..6cba9a18a 100644 --- a/src/ForestSurvival.cpp +++ b/src/ForestSurvival.cpp @@ -65,6 +65,11 @@ void ForestSurvival::initInternal() { min_node_size = DEFAULT_MIN_NODE_SIZE_SURVIVAL; } + // Set minimal bucket size + if (min_bucket == 0) { + min_bucket = DEFAULT_MIN_BUCKET_SURVIVAL; + } + // Create unique timepoints if (!prediction_mode) { std::set unique_timepoint_set; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 931661a93..65b57caba 100644 --- a/src/RcppExports.cpp +++ 
b/src/RcppExports.cpp @@ -13,8 +13,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // rangerCpp -Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); -RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP 
keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) { +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); +RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, 
SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -30,6 +30,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type write_forest(write_forestSEXP); Rcpp::traits::input_parameter< uint >::type importance_mode_r(importance_mode_rSEXP); Rcpp::traits::input_parameter< uint >::type min_node_size(min_node_sizeSEXP); + Rcpp::traits::input_parameter< uint >::type min_bucket(min_bucketSEXP); Rcpp::traits::input_parameter< std::vector>& >::type split_select_weights(split_select_weightsSEXP); Rcpp::traits::input_parameter< bool >::type use_split_select_weights(use_split_select_weightsSEXP); Rcpp::traits::input_parameter< std::vector& >::type always_split_variable_names(always_split_variable_namesSEXP); @@ -64,7 +65,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::vector& >::type regularization_factor(regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type use_regularization_factor(use_regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type regularization_usedepth(regularization_usedepthSEXP); - rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, 
use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)); + rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)); return rcpp_result_gen; END_RCPP } @@ -95,7 +96,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 46}, + {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 47}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3}, {NULL, NULL, 0} diff --git a/src/Tree.cpp b/src/Tree.cpp index af9231f68..c6ef6303f 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -17,7 +17,7 @@ namespace ranger { Tree::Tree() : - mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), case_weights( + mtry(0), 
num_samples(0), num_samples_oob(0), min_node_size(0), min_bucket(0), deterministic_varIDs(0), split_select_weights(0), case_weights( 0), manual_inbag(0), oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth( false), split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( true), sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( @@ -27,7 +27,7 @@ Tree::Tree() : Tree::Tree(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values) : - mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_weights(0), case_weights( + mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), min_bucket(0), deterministic_varIDs(0), split_select_weights(0), case_weights( 0), manual_inbag(0), split_varIDs(split_varIDs), split_values(split_values), child_nodeIDs(child_nodeIDs), oob_sampleIDs( 0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used( 0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction( @@ -37,7 +37,7 @@ Tree::Tree(std::vector>& child_nodeIDs, std::vector& } void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, - std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, + std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, @@ -60,6 +60,7 @@ void Tree::init(const 
Data* data, uint mtry, size_t num_samples, uint seed, std: this->split_select_weights = split_select_weights; this->importance_mode = importance_mode; this->min_node_size = min_node_size; + this->min_bucket = min_bucket; this->sample_with_replacement = sample_with_replacement; this->splitrule = splitrule; this->case_weights = case_weights; diff --git a/src/Tree.h b/src/Tree.h index cb409b39c..3acbfa20f 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -36,7 +36,7 @@ class Tree { Tree& operator=(const Tree&) = delete; void init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, - std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, + std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, @@ -155,8 +155,11 @@ class Tree { // Number of OOB samples size_t num_samples_oob; - // Minimum node size to split, like in original RF nodes of smaller size can be produced + // Minimum node size to split, nodes of smaller size can be produced uint min_node_size; + + // Minimum bucket size, minimum number of samples in each node + uint min_bucket; // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable // Deterministic variables are always selected diff --git a/src/TreeClassification.cpp b/src/TreeClassification.cpp index 5a5f01db7..7353f47ab 100644 --- a/src/TreeClassification.cpp +++ b/src/TreeClassification.cpp @@ -158,29 +158,33 @@ bool TreeClassification::findBestSplit(size_t nodeID, std::vector& possi ++class_counts[sample_classID]; } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best 
split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { - - // Use memory saving method if option set - if (memory_saving_splitting) { - findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - // Use faster method for both cases - double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); - if (q < Q_THRESHOLD) { +// Stop early if no split posssible + if (num_samples_node >= 2 * min_bucket) { + + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + + // Use memory saving method if option set + if (memory_saving_splitting) { findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } else { - findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); + // Use faster method for both cases + double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); + if (q < Q_THRESHOLD) { + findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } } + } else { + findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); } - } else { - findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); } } @@ -264,6 +268,11 @@ void TreeClassification::findBestSplitValueSmallQ(size_t nodeID, size_t varID, s break; } + // Stop if minimal bucket size 
reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { @@ -351,6 +360,11 @@ void TreeClassification::findBestSplitValueLargeQ(size_t nodeID, size_t varID, s break; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { @@ -457,6 +471,11 @@ void TreeClassification::findBestSplitValueUnordered(size_t nodeID, size_t varID } size_t n_left = num_samples_node - n_right; + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double decrease; if (splitrule == HELLINGER) { // TPR is number of outcome 1s in one node / total number of 1s @@ -512,15 +531,19 @@ bool TreeClassification::findBestSplitExtraTrees(size_t nodeID, std::vectorisOrderedVariable(varID)) { - findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, - best_varID, best_decrease); + // Stop early if no split posssible + if (num_samples_node >= 2 * min_bucket) { + + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, + best_varID, best_decrease); + } } } @@ -614,6 +637,11 @@ void TreeClassification::findBestSplitValueExtraTrees(size_t nodeID, size_t varI continue; } + // Stop if minimal 
bucket size reached + if (n_left < min_bucket || n_right[i] < min_bucket) { + continue; + } + // Sum of squares double sum_left = 0; double sum_right = 0; @@ -720,6 +748,11 @@ void TreeClassification::findBestSplitValueExtraTreesUnordered(size_t nodeID, si } size_t n_left = num_samples_node - n_right; + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + // Sum of squares double sum_left = 0; double sum_right = 0; diff --git a/src/TreeProbability.cpp b/src/TreeProbability.cpp index 042f9c6a4..5aade041f 100644 --- a/src/TreeProbability.cpp +++ b/src/TreeProbability.cpp @@ -158,29 +158,33 @@ bool TreeProbability::findBestSplit(size_t nodeID, std::vector& possible ++class_counts[sample_classID]; } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { - - // Use memory saving method if option set - if (memory_saving_splitting) { - findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - // Use faster method for both cases - double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); - if (q < Q_THRESHOLD) { + // Stop early if no split posssible + if (num_samples_node >= 2 * min_bucket) { + + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + + // Use memory saving method if option set + if (memory_saving_splitting) { findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } else { - findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - 
best_decrease); + // Use faster method for both cases + double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); + if (q < Q_THRESHOLD) { + findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } } + } else { + findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); } - } else { - findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); } } @@ -264,6 +268,11 @@ void TreeProbability::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size break; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { @@ -351,6 +360,11 @@ void TreeProbability::findBestSplitValueLargeQ(size_t nodeID, size_t varID, size break; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double decrease; if (splitrule == HELLINGER) { for (size_t j = 0; j < num_classes; ++j) { @@ -457,6 +471,11 @@ void TreeProbability::findBestSplitValueUnordered(size_t nodeID, size_t varID, s } size_t n_left = num_samples_node - n_right; + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double decrease; if (splitrule == HELLINGER) { // TPR is number of outcome 1s in one node / total number of 1s @@ -512,15 +531,19 @@ bool TreeProbability::findBestSplitExtraTrees(size_t nodeID, std::vector ++class_counts[sample_classID]; } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all 
values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { - findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, - best_varID, best_decrease); + // Stop early if no split posssible + if (num_samples_node >= 2 * min_bucket) { + + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, + best_varID, best_decrease); + } } } @@ -614,6 +637,11 @@ void TreeProbability::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, continue; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right[i] < min_bucket) { + continue; + } + // Sum of squares double sum_left = 0; double sum_right = 0; @@ -720,6 +748,11 @@ void TreeProbability::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_ } size_t n_left = num_samples_node - n_right; + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + // Sum of squares double sum_left = 0; double sum_right = 0; diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 812adbbf0..640395a6f 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -141,26 +141,30 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ sum_node += data->get_y(sampleID, 0); } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { + // Stop early if no split posssible + if 
(num_samples_node >= 2 * min_bucket) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { + // For all possible split variables + for (auto& varID : possible_split_varIDs) { - // Use memory saving method if option set - if (memory_saving_splitting) { - findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); - } else { - // Use faster method for both cases - double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); - if (q < Q_THRESHOLD) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + + // Use memory saving method if option set + if (memory_saving_splitting) { findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); } else { - findBestSplitValueLargeQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + // Use faster method for both cases + double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); + if (q < Q_THRESHOLD) { + findBestSplitValueSmallQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + } else { + findBestSplitValueLargeQ(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + } } + } else { + findBestSplitValueUnordered(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); } - } else { - findBestSplitValueUnordered(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); } } @@ -243,6 +247,11 @@ void TreeRegression::findBestSplitValueSmallQ(size_t nodeID, size_t varID, doubl break; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double sum_right = sum_node - sum_left; double decrease = sum_left * sum_left / 
(double) n_left + sum_right * sum_right / (double) n_right; @@ -300,6 +309,11 @@ void TreeRegression::findBestSplitValueLargeQ(size_t nodeID, size_t varID, doubl break; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + double sum_right = sum_node - sum_left; double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right; @@ -377,6 +391,11 @@ void TreeRegression::findBestSplitValueUnordered(size_t nodeID, size_t varID, do } size_t n_left = num_samples_node - n_right; + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + // Sum of squares double sum_left = sum_node - sum_right; double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right; @@ -510,15 +529,19 @@ bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector& sum_node += data->get_y(sampleID, 0); } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { + // Stop early if no split possible + if (num_samples_node >= 2 * min_bucket) { + + // For all possible split variables + for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { - findBestSplitValueExtraTrees(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); - } else { - findBestSplitValueExtraTreesUnordered(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, - best_decrease); + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + findBestSplitValueExtraTrees(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + } else { + findBestSplitValueExtraTreesUnordered(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, +
best_decrease); + } } } @@ -611,6 +634,11 @@ void TreeRegression::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, d continue; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right[i] < min_bucket) { + continue; + } + double sum_right = sums_right[i]; double sum_left = sum_node - sum_right; double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right[i]; @@ -706,6 +734,11 @@ void TreeRegression::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t } size_t n_left = num_samples_node - n_right; + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right < min_bucket) { + continue; + } + // Sum of squares double sum_left = sum_node - sum_right; double decrease = sum_left * sum_left / (double) n_left + sum_right * sum_right / (double) n_right; @@ -736,9 +769,13 @@ bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector& possi sum_node += data->get_y(sampleID, 0); } - // For all possible split variables find best split value - for (auto& varID : possible_split_varIDs) { - findBestSplitValueBeta(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + // Stop early if no split possible + if (num_samples_node >= 2 * min_bucket) { + + // For all possible split variables find best split value + for (auto& varID : possible_split_varIDs) { + findBestSplitValueBeta(nodeID, varID, sum_node, num_samples_node, best_value, best_varID, best_decrease); + } } // Stop if no good split found @@ -820,6 +857,11 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double continue; } + // Stop if minimal bucket size reached + if (n_left < min_bucket || n_right[i] < min_bucket) { + continue; + } + // Compute mean double sum_right = sums_right[i]; double mean_right = sum_right / (double) n_right[i]; diff --git a/src/TreeSurvival.cpp b/src/TreeSurvival.cpp index 0690aa665..d31d989ce 100644 --- a/src/TreeSurvival.cpp +++
b/src/TreeSurvival.cpp @@ -127,14 +127,14 @@ bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_sp computeDeathCounts(nodeID); - // Stop if maximum node size or depth reached (will check again for each child node) + // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { computeSurvival(nodeID); return true; } // Stop early if no split posssible - if (num_samples_node >= 2 * min_node_size) { + if (num_samples_node >= 2 * min_bucket) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -382,9 +382,9 @@ void TreeSurvival::findBestSplitValueLogRank(size_t nodeID, size_t varID, double double numerator = 0; double denominator_squared = 0; - // Stop if minimal node size reached + // Stop if minimal bucket size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child[i]; - if (num_samples_right_child[i] < min_node_size || num_samples_left_child < min_node_size) { + if (num_samples_right_child[i] < min_bucket || num_samples_left_child < min_bucket) { continue; } @@ -488,9 +488,9 @@ void TreeSurvival::findBestSplitValueLogRankUnordered(size_t nodeID, size_t varI } - // Stop if minimal node size reached + // Stop if minimal bucket size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child; - if (num_samples_right_child < min_node_size || num_samples_left_child < min_node_size) { + if (num_samples_right_child < min_bucket || num_samples_left_child < min_bucket) { continue; } @@ -579,9 +579,9 @@ void TreeSurvival::findBestSplitValueAUC(size_t nodeID, size_t varID, double& be } for (size_t i = 0; i < num_splits; ++i) { - // Do not consider this split point if fewer than min_node_size samples in one node + // Do not consider this split point if fewer than min_bucket samples in one node size_t num_samples_right_child = num_node_samples - num_samples_left_child[i]; - if 
(num_samples_left_child[i] < min_node_size || num_samples_right_child < min_node_size) { + if (num_samples_left_child[i] < min_bucket || num_samples_right_child < min_bucket) { continue; } else { double auc = fabs((num_count[i] / 2) / num_total[i] - 0.5); @@ -678,14 +678,14 @@ bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& p computeDeathCounts(nodeID); - // Stop if maximum node size or depth reached (will check again for each child node) + // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { computeSurvival(nodeID); return true; } // Stop early if no split posssible - if (num_samples_node >= 2 * min_node_size) { + if (num_samples_node >= 2 * min_bucket) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -764,7 +764,7 @@ void TreeSurvival::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, dou // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child[i]; - if (num_samples_right_child[i] < min_node_size || num_samples_left_child < min_node_size) { + if (num_samples_right_child[i] < min_bucket || num_samples_left_child < min_bucket) { continue; } @@ -893,7 +893,7 @@ void TreeSurvival::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t v // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child; - if (num_samples_right_child < min_node_size || num_samples_left_child < min_node_size) { + if (num_samples_right_child < min_bucket || num_samples_left_child < min_bucket) { continue; } diff --git a/src/globals.h b/src/globals.h index b794273be..3b6a6ba5b 100644 --- a/src/globals.h +++ b/src/globals.h @@ -87,6 +87,8 @@ const uint DEFAULT_MIN_NODE_SIZE_CLASSIFICATION = 1; const uint DEFAULT_MIN_NODE_SIZE_REGRESSION = 5; const uint DEFAULT_MIN_NODE_SIZE_SURVIVAL = 3; const uint 
DEFAULT_MIN_NODE_SIZE_PROBABILITY = 10; +const uint DEFAULT_MIN_BUCKET = 1; +const uint DEFAULT_MIN_BUCKET_SURVIVAL = 3; const SplitRule DEFAULT_SPLITRULE = LOGRANK; const double DEFAULT_ALPHA = 0.5; diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index 991fc9609..bc4790f32 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -50,7 +50,7 @@ using namespace ranger; // [[Rcpp::export]] Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, - bool write_forest, uint importance_mode_r, uint min_node_size, + bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, @@ -149,7 +149,7 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM // Init Ranger forest->initR(std::move(data), mtry, num_trees, verbose_out, seed, num_threads, - importance_mode, min_node_size, split_select_weights, always_split_variable_names, + importance_mode, min_node_size, min_bucket, split_select_weights, always_split_variable_names, prediction_mode, sample_with_replacement, unordered_variable_names, save_memory, splitrule, case_weights, inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, regularization_factor, regularization_usedepth); diff --git a/tests/testthat/test_ranger.R b/tests/testthat/test_ranger.R index f0e3d56dc..0f2c0118a 100644 --- a/tests/testthat/test_ranger.R +++ b/tests/testthat/test_ranger.R @@ -348,3 +348,37 @@ test_that("mtry function error halts the ranger function", { ranger(Species ~ ., data = iris, mtry = function(n) stop("this is some error")), "mtry function evaluation resulted 
in an error.") }) + +test_that("min.bucket creates nodes of correct size", { + + # Size 2 + rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, + min.bucket = 2, keep.inbag = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + smallest_node <- min(sapply(1:ncol(pred), function(i) { + min(table(pred[inbag[, i], i])) + })) + expect_gte(smallest_node, 2) + + # Size 10 + rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, + min.bucket = 10, keep.inbag = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + smallest_node <- min(sapply(1:ncol(pred), function(i) { + min(table(pred[inbag[, i], i])) + })) + expect_gte(smallest_node, 10) + + # Random size + min.bucket <- round(runif(1, 1, 40)) + rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, + min.bucket = min.bucket, keep.inbag = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + smallest_node <- min(sapply(1:ncol(pred), function(i) { + min(table(pred[inbag[, i], i])) + })) + expect_gte(smallest_node, min.bucket) +}) From d98279c7020c552a3ee80d9886a91e1b771c7369 Mon Sep 17 00:00:00 2001 From: Marras Antoine Date: Sun, 27 Nov 2022 14:34:48 +0100 Subject: [PATCH 048/111] fixed typo sepErat* to sepArat* --- README.md | 2 +- src/Data.cpp | 10 +++++----- src/Data.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d31e62821..da686e2d2 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ In the C++ version type ./ranger --help ``` -for a list of commands. First you need a training dataset in a file. This file should contain one header line with variable names and one line with variable values per sample (numeric only). Variable names must not contain any whitespace, comma or semicolon. 
Values can be seperated by whitespace, comma or semicolon but can not be mixed in one file. A typical call of ranger would be for example +for a list of commands. First you need a training dataset in a file. This file should contain one header line with variable names and one line with variable values per sample (numeric only). Variable names must not contain any whitespace, comma or semicolon. Values can be separated by whitespace, comma or semicolon but can not be mixed in one file. A typical call of ranger would be for example ```bash ./ranger --verbose --file data.dat --depvarname Species --treetype 1 --ntree 1000 --nthreads 4 diff --git a/src/Data.cpp b/src/Data.cpp index c3f0a7161..0363e7fd9 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -63,11 +63,11 @@ bool Data::loadFromFile(std::string filename, std::vector& dependen input_file.close(); input_file.open(filename); - // Check if comma, semicolon or whitespace seperated + // Check if comma, semicolon or whitespace separated std::string header_line; getline(input_file, header_line); - // Find out if comma, semicolon or whitespace seperated and call appropriate method + // Find out if comma, semicolon or whitespace separated and call appropriate method if (header_line.find(',') != std::string::npos) { result = loadFromFileOther(input_file, header_line, dependent_variable_names, ','); } else if (header_line.find(';') != std::string::npos) { @@ -150,7 +150,7 @@ bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_ } bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, - std::vector& dependent_variable_names, char seperator) { + std::vector& dependent_variable_names, char separator) { size_t num_dependent_variables = dependent_variable_names.size(); std::vector dependent_varIDs; @@ -160,7 +160,7 @@ bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, std::string header_token; std::stringstream header_line_stream(header_line); 
size_t col = 0; - while (getline(header_line_stream, header_token, seperator)) { + while (getline(header_line_stream, header_token, separator)) { bool is_dependent_var = false; for (size_t i = 0; i < dependent_variable_names.size(); ++i) { if (header_token == dependent_variable_names[i]) { @@ -187,7 +187,7 @@ bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, double token; std::stringstream line_stream(line); size_t column = 0; - while (getline(line_stream, token_string, seperator)) { + while (getline(line_stream, token_string, separator)) { std::stringstream token_stream(token_string); readFromStream(token_stream, token); diff --git a/src/Data.h b/src/Data.h index 7c0ebc993..c58e5ec66 100644 --- a/src/Data.h +++ b/src/Data.h @@ -47,7 +47,7 @@ class Data { bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line, std::vector& dependent_variable_names); bool loadFromFileOther(std::ifstream& input_file, std::string header_line, - std::vector& dependent_variable_names, char seperator); + std::vector& dependent_variable_names, char separator); void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const; From 66aa1044dfa305dd87be457d097c60e2cd6a7da5 Mon Sep 17 00:00:00 2001 From: stephematician Date: Mon, 27 Feb 2023 22:41:22 +1100 Subject: [PATCH 049/111] Fixes #656 --- src/Forest.cpp | 4 ++-- tests/testthat/test_splitweights.R | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Forest.cpp b/src/Forest.cpp index 7d46a64c1..a06123fd4 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -967,8 +967,8 @@ void Forest::setSplitWeightVector(std::vector>& split_select // Deterministic varIDs std::vector is_deterministic(num_weights, false); - for (size_t i = 0; i < deterministic_varIDs.size(); ++i) { - is_deterministic[i] = true; + for (auto it = deterministic_varIDs.cbegin(); it != deterministic_varIDs.cend(); ++it) { + 
is_deterministic[*it] = true; } // Split up in deterministic and weighted variables, ignore zero weights diff --git a/tests/testthat/test_splitweights.R b/tests/testthat/test_splitweights.R index 8e753b16f..caf3c2eba 100644 --- a/tests/testthat/test_splitweights.R +++ b/tests/testthat/test_splitweights.R @@ -45,3 +45,15 @@ test_that("Tree-wise split select weights work with 0s", { }) expect_true(all(selected_correctly)) }) + +test_that("always split variables respect split select weights", { + iris_vars <- setdiff(names(iris), 'Species') + n_vars <- length(iris_vars) + last_var <- iris_vars[n_vars] + with_last_zero <- c(rep(1, n_vars-1), 0) + expect_silent( + ranger(Species ~ ., iris, num.trees=5, + always.split.variables=last_var, mtry=n_vars-1, + split.select.weights=with_last_zero) + ) +}) From 940516cb63fe5122a43a9de36e49db5caafc417a Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 06:23:49 +0100 Subject: [PATCH 050/111] switch to C++14 standard --- DESCRIPTION | 4 ++-- NEWS.md | 2 ++ README.md | 2 +- cpp_version/CMakeLists.txt | 12 ++++++------ cpp_version/src/main.cpp | 10 +++++----- cpp_version/src/version.h | 2 +- cpp_version/test/CMakeLists.txt | 2 +- src/AAA_check_cpp11.cpp | 4 ++-- src/Forest.cpp | 6 +++--- src/ForestClassification.cpp | 6 +++--- src/ForestProbability.cpp | 6 +++--- src/ForestRegression.cpp | 6 +++--- src/ForestSurvival.cpp | 6 +++--- src/Makevars | 2 -- src/Makevars.win | 2 -- src/rangerCpp.cpp | 14 +++++++------- src/utility.h | 31 ------------------------------- 17 files changed, 42 insertions(+), 75 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0d7062f58..3ce3a4b90 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.14.1 -Date: 2022-06-17 +Version: 0.14.2 +Date: 2023-03-03 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 759ccedf5..3f7a2e077 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# ranger 0.14.2 +* Switch to C++14 standard # ranger 0.14.1 * URL fix for CRAN diff --git a/README.md b/README.md index da686e2d2..cd0026393 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ devtools::install_github("imbs-hl/ranger") ``` #### Standalone C++ version -To install the C++ version of ranger in Linux or Mac OS X you will need a compiler supporting C++11 (i.e. gcc >= 4.7 or Clang >= 3.0) and Cmake. To build start a terminal from the ranger main directory and run the following commands +To install the C++ version of ranger in Linux or Mac OS X you will need a compiler supporting C++14 (i.e. gcc >= 5 or Clang >= 3.4) and Cmake. To build start a terminal from the ranger main directory and run the following commands ```bash cd cpp_version diff --git a/cpp_version/CMakeLists.txt b/cpp_version/CMakeLists.txt index 397bb533c..c9bdd380d 100644 --- a/cpp_version/CMakeLists.txt +++ b/cpp_version/CMakeLists.txt @@ -2,20 +2,20 @@ project(ranger) cmake_minimum_required(VERSION 2.0) ## ======================================================================================## -## Check for C++11. For GCC this is >=4.7 +## Check for C++14. For GCC this is >=5 ## ======================================================================================## include(CheckCXXCompilerFlag) -CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11) -if(COMPILER_SUPPORTS_CXX11) - message("Compiler with C++11 support found.") +CHECK_CXX_COMPILER_FLAG("-std=c++14" COMPILER_SUPPORTS_CXX14) +if(COMPILER_SUPPORTS_CXX14) + message("Compiler with C++14 support found.") else() - message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler, i.e.
gcc >= 4.7 or Clang >= 3.0.") + message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++14 support. Please use a different C++ compiler, i.e. gcc >= 5 or Clang >= 3.4.") endif() ## ======================================================================================## ## Compiler flags ## ======================================================================================## -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++14") ## ======================================================================================## ## In Clang phtread flag only for compiler, not for linker. For diff --git a/cpp_version/src/main.cpp b/cpp_version/src/main.cpp index 7b63d02b0..73c2c8adf 100644 --- a/cpp_version/src/main.cpp +++ b/cpp_version/src/main.cpp @@ -33,19 +33,19 @@ void run_ranger(const ArgumentHandler& arg_handler, std::ostream& verbose_out) { switch (arg_handler.treetype) { case TREE_CLASSIFICATION: if (arg_handler.probability) { - forest = make_unique(); + forest = std::make_unique(); } else { - forest = make_unique(); + forest = std::make_unique(); } break; case TREE_REGRESSION: - forest = make_unique(); + forest = std::make_unique(); break; case TREE_SURVIVAL: - forest = make_unique(); + forest = std::make_unique(); break; case TREE_PROBABILITY: - forest = make_unique(); + forest = std::make_unique(); break; } diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 1d643dcd8..d59bf779c 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.14.1" +#define RANGER_VERSION "0.14.2" #endif diff --git a/cpp_version/test/CMakeLists.txt b/cpp_version/test/CMakeLists.txt index cab760a67..82ea6d51c 100644 --- a/cpp_version/test/CMakeLists.txt +++ b/cpp_version/test/CMakeLists.txt @@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 2.0) ## 
======================================================================================## ## Compiler flags ## ======================================================================================## -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++14") set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS -pthread) ## ======================================================================================## diff --git a/src/AAA_check_cpp11.cpp b/src/AAA_check_cpp11.cpp index 13effbd07..5df6e69b0 100644 --- a/src/AAA_check_cpp11.cpp +++ b/src/AAA_check_cpp11.cpp @@ -1,6 +1,6 @@ #ifndef WIN_R_BUILD -#if __cplusplus < 201103L -#error Error: ranger requires a real C++11 compiler, e.g., gcc >= 4.7 or Clang >= 3.0. You probably have to update your C++ compiler. +#if __cplusplus < 201402L +#error Error: ranger requires a C++14 compiler, e.g., gcc >= 5 or Clang >= 3.4. You probably have to update your C++ compiler. #endif #endif diff --git a/src/Forest.cpp b/src/Forest.cpp index 7d46a64c1..73085af4b 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -925,13 +925,13 @@ std::unique_ptr Forest::loadDataFromFile(const std::string& data_path) { std::unique_ptr result { }; switch (memory_mode) { case MEM_DOUBLE: - result = make_unique(); + result = std::make_unique(); break; case MEM_FLOAT: - result = make_unique(); + result = std::make_unique(); break; case MEM_CHAR: - result = make_unique(); + result = std::make_unique(); break; } diff --git a/src/ForestClassification.cpp b/src/ForestClassification.cpp index 18ec35fee..786439e88 100644 --- a/src/ForestClassification.cpp +++ b/src/ForestClassification.cpp @@ -37,7 +37,7 @@ void ForestClassification::loadForest(size_t num_trees, trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( - make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], + std::make_unique(forest_child_nodeIDs[i], 
forest_split_varIDs[i], forest_split_values[i], &this->class_values, &response_classIDs)); } @@ -101,7 +101,7 @@ void ForestClassification::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( - make_unique(&class_values, &response_classIDs, &sampleIDs_per_class, &class_weights)); + std::make_unique(&class_values, &response_classIDs, &sampleIDs_per_class, &class_weights)); } } @@ -314,7 +314,7 @@ void ForestClassification::loadFromFileInternal(std::ifstream& infile) { // Create tree trees.push_back( - make_unique(child_nodeIDs, split_varIDs, split_values, &class_values, &response_classIDs)); + std::make_unique(child_nodeIDs, split_varIDs, split_values, &class_values, &response_classIDs)); } } diff --git a/src/ForestProbability.cpp b/src/ForestProbability.cpp index 9a17bbad9..ef73e30de 100644 --- a/src/ForestProbability.cpp +++ b/src/ForestProbability.cpp @@ -32,7 +32,7 @@ void ForestProbability::loadForest(size_t num_trees, trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( - make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], + std::make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], &this->class_values, &response_classIDs, forest_terminal_class_counts[i])); } @@ -106,7 +106,7 @@ void ForestProbability::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( - make_unique(&class_values, &response_classIDs, &sampleIDs_per_class, &class_weights)); + std::make_unique(&class_values, &response_classIDs, &sampleIDs_per_class, &class_weights)); } } @@ -321,7 +321,7 @@ void ForestProbability::loadFromFileInternal(std::ifstream& infile) { // Create tree trees.push_back( - make_unique(child_nodeIDs, split_varIDs, split_values, &class_values, &response_classIDs, + std::make_unique(child_nodeIDs, split_varIDs, split_values, &class_values, &response_classIDs, terminal_class_counts)); } } 
diff --git a/src/ForestRegression.cpp b/src/ForestRegression.cpp index b721cb7ed..4f8b20853 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -32,7 +32,7 @@ void ForestRegression::loadForest(size_t num_trees, trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back( - make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i])); + std::make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i])); } // Create thread ranges @@ -71,7 +71,7 @@ void ForestRegression::initInternal() { void ForestRegression::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { - trees.push_back(make_unique()); + trees.push_back(std::make_unique()); } } @@ -241,7 +241,7 @@ void ForestRegression::loadFromFileInternal(std::ifstream& infile) { } // Create tree - trees.push_back(make_unique(child_nodeIDs, split_varIDs, split_values)); + trees.push_back(std::make_unique(child_nodeIDs, split_varIDs, split_values)); } } diff --git a/src/ForestSurvival.cpp b/src/ForestSurvival.cpp index f5416c937..89eecb2b8 100644 --- a/src/ForestSurvival.cpp +++ b/src/ForestSurvival.cpp @@ -34,7 +34,7 @@ void ForestSurvival::loadForest(size_t num_trees, std::vector(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], + std::make_unique(forest_child_nodeIDs[i], forest_split_varIDs[i], forest_split_values[i], forest_chf[i], &this->unique_timepoints, &response_timepointIDs)); } @@ -95,7 +95,7 @@ void ForestSurvival::initInternal() { void ForestSurvival::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { - trees.push_back(make_unique(&unique_timepoints, &response_timepointIDs)); + trees.push_back(std::make_unique(&unique_timepoints, &response_timepointIDs)); } } @@ -316,7 +316,7 @@ void ForestSurvival::loadFromFileInternal(std::ifstream& infile) { // Create tree trees.push_back( - make_unique(child_nodeIDs, split_varIDs, split_values, 
chf, &unique_timepoints, + std::make_unique(child_nodeIDs, split_varIDs, split_values, chf, &unique_timepoints, &response_timepointIDs)); } } diff --git a/src/Makevars b/src/Makevars index 89fd506db..a77f23960 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,2 @@ -## Use c++11 -CXX_STD = CXX11 PKG_CPPFLAGS = -DR_BUILD diff --git a/src/Makevars.win b/src/Makevars.win index 1c1fd15b4..a6af4dd1c 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,4 +1,2 @@ -## Use c++11 -CXX_STD = CXX11 PKG_CPPFLAGS = -DR_BUILD -DWIN_R_BUILD diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index 991fc9609..ceb1a798e 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -108,9 +108,9 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM // Initialize data if (use_sparse_data) { - data = make_unique(sparse_x, input_y, variable_names, num_rows, num_cols); + data = std::make_unique(sparse_x, input_y, variable_names, num_rows, num_cols); } else { - data = make_unique(input_x, input_y, variable_names, num_rows, num_cols); + data = std::make_unique(input_x, input_y, variable_names, num_rows, num_cols); } // If there is snp data, add it @@ -127,19 +127,19 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM switch (treetype) { case TREE_CLASSIFICATION: if (probability) { - forest = make_unique(); + forest = std::make_unique(); } else { - forest = make_unique(); + forest = std::make_unique(); } break; case TREE_REGRESSION: - forest = make_unique(); + forest = std::make_unique(); break; case TREE_SURVIVAL: - forest = make_unique(); + forest = std::make_unique(); break; case TREE_PROBABILITY: - forest = make_unique(); + forest = std::make_unique(); break; } diff --git a/src/utility.h b/src/utility.h index ac809b1a6..1460c3023 100644 --- a/src/utility.h +++ b/src/utility.h @@ -536,37 +536,6 @@ inline bool checkInterrupt() { } #endif -// Provide make_unique (not available in C++11) -namespace detail { - -template 
struct _Unique_if { - typedef std::unique_ptr _Single_object; -}; - -template struct _Unique_if { - typedef std::unique_ptr _Unknown_bound; -}; - -template struct _Unique_if { - typedef void _Known_bound; -}; - -} // namespace detail - -template -typename detail::_Unique_if::_Single_object make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -template -typename detail::_Unique_if::_Unknown_bound make_unique(size_t n) { - typedef typename std::remove_extent::type U; - return std::unique_ptr(new U[n]()); -} - -template -typename detail::_Unique_if::_Known_bound make_unique(Args&&...) = delete; - } // namespace ranger From 3214fc6f3d097482e359db8550d298319621aa09 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 06:57:52 +0100 Subject: [PATCH 051/111] update GH actions workflow --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 2ceeced34..1d19e9544 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -29,7 +29,7 @@ jobs: R_KEEP_PKG_SOURCE: yes steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: r-lib/actions/setup-pandoc@v2 From 5062f545a29f03c2077b432039b131d193cd3604 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 07:31:25 +0100 Subject: [PATCH 052/111] drop support of old RTools without C++11/14 --- .Rbuildignore | 1 + .gitignore | 3 +- DESCRIPTION | 2 +- src/Forest.cpp | 103 ------------------------------------------------- src/Forest.h | 8 ---- src/globals.h | 9 ----- 6 files changed, 4 insertions(+), 122 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 8d8c64dce..aafcb993b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,3 +14,4 @@ ^ranger.*\.tar\.gz$ ^ranger.*\.tgz$ revdep/* +^\.vscode$ diff --git a/.gitignore b/.gitignore index 47ec7a386..702d4b071 100644 --- a/.gitignore +++ 
b/.gitignore @@ -8,4 +8,5 @@ ranger.Rcheck/ docs ranger*.tar.gz ranger*.tgz -revdep/* \ No newline at end of file +revdep/* +.vscode \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 3ce3a4b90..96e01f677 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,6 +19,6 @@ Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.2 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/src/Forest.cpp b/src/Forest.cpp index 73085af4b..e34736b97 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -15,10 +15,8 @@ #include #include #include -#ifndef OLD_WIN_R_BUILD #include #include -#endif #include "utility.h" #include "Forest.h" @@ -199,11 +197,7 @@ void Forest::init(std::unique_ptr input_data, uint mtry, std::string outpu // Set number of threads if (num_threads == DEFAULT_NUM_THREADS) { -#ifdef OLD_WIN_R_BUILD - this->num_threads = 1; -#else this->num_threads = std::thread::hardware_concurrency(); -#endif } else { this->num_threads = num_threads; } @@ -486,18 +480,6 @@ void Forest::grow() { variable_importance.resize(num_independent_variables, 0); // Grow trees in multiple threads -#ifdef OLD_WIN_R_BUILD - // #nocov start - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->grow(&variable_importance); - progress++; - showProgress("Growing trees..", start_time, lap_time); - } - // #nocov end -#else progress = 0; #ifdef R_BUILD aborted = false; @@ -538,8 +520,6 @@ void Forest::grow() { variable_importance_threads.clear(); } -#endif - // Divide importance by number of trees if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { for (auto& v : variable_importance) { @@ -551,24 +531,6 @@ void Forest::grow() { void Forest::predict() { // Predict trees in multiple threads and join the threads with the main thread -#ifdef OLD_WIN_R_BUILD - // #nocov start - progress = 0; - clock_t 
start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->predict(data.get(), false); - progress++; - showProgress("Predicting..", start_time, lap_time); - } - - // For all samples get tree predictions - allocatePredictMemory(); - for (size_t sample_idx = 0; sample_idx < data->getNumRows(); ++sample_idx) { - predictInternal(sample_idx); - } - // #nocov end -#else progress = 0; #ifdef R_BUILD aborted = false; @@ -604,24 +566,11 @@ void Forest::predict() { throw std::runtime_error("User interrupt."); } #endif -#endif } void Forest::computePredictionError() { // Predict trees in multiple threads -#ifdef OLD_WIN_R_BUILD - // #nocov start - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->predict(data.get(), true); - progress++; - showProgress("Predicting..", start_time, lap_time); - } - // #nocov end -#else std::vector threads; threads.reserve(num_threads); progress = 0; @@ -637,7 +586,6 @@ void Forest::computePredictionError() { if (aborted_threads > 0) { throw std::runtime_error("User interrupt."); } -#endif #endif // Call special function for subclasses @@ -647,30 +595,6 @@ void Forest::computePredictionError() { void Forest::computePermutationImportance() { // Compute tree permutation importance in multiple threads -#ifdef OLD_WIN_R_BUILD - // #nocov start - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - - // Initialize importance and variance - variable_importance.resize(num_independent_variables, 0); - std::vector variance; - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - variance.resize(num_independent_variables, 0); - } - if (importance_mode == IMP_PERM_CASEWISE) { - variable_importance_casewise.resize(num_independent_variables * num_samples, 0); - } - - // Compute importance - for (size_t i = 0; i < num_trees; ++i) { - 
trees[i]->computePermutationImportance(variable_importance, variance, variable_importance_casewise); - progress++; - showProgress("Computing permutation importance..", start_time, lap_time); - } - -#else progress = 0; #ifdef R_BUILD aborted = false; @@ -739,7 +663,6 @@ void Forest::computePermutationImportance() { } variable_importance_casewise_threads.clear(); } -#endif for (size_t i = 0; i < variable_importance.size(); ++i) { variable_importance[i] /= num_trees; @@ -760,7 +683,6 @@ void Forest::computePermutationImportance() { } } -#ifndef OLD_WIN_R_BUILD void Forest::growTreesInThread(uint thread_idx, std::vector* variable_importance) { if (thread_ranges.size() > thread_idx + 1) { for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { @@ -857,7 +779,6 @@ void Forest::computeTreePermutationImportanceInThread(uint thread_idx, std::vect } } } -#endif // #nocov start void Forest::loadFromFile(std::string filename) { @@ -1027,29 +948,6 @@ void Forest::setAlwaysSplitVariables(const std::vector& always_spli } } -#ifdef OLD_WIN_R_BUILD -// #nocov start -void Forest::showProgress(std::string operation, clock_t start_time, clock_t& lap_time) { - - // Check for user interrupt - if (checkInterrupt()) { - throw std::runtime_error("User interrupt."); - } - - double elapsed_time = (clock() - lap_time) / CLOCKS_PER_SEC; - if (elapsed_time > STATUS_INTERVAL) { - double relative_progress = (double) progress / (double) num_trees; - double time_from_start = (clock() - start_time) / CLOCKS_PER_SEC; - uint remaining_time = (1 / relative_progress - 1) * time_from_start; - if (verbose_out) { - *verbose_out << operation << " Progress: " << round(100 * relative_progress) - << "%. Estimated remaining time: " << beautifyTime(remaining_time) << "." 
<< std::endl; - } - lap_time = clock(); - } -} -// #nocov end -#else void Forest::showProgress(std::string operation, size_t max_progress) { using std::chrono::steady_clock; using std::chrono::duration_cast; @@ -1086,6 +984,5 @@ void Forest::showProgress(std::string operation, size_t max_progress) { } } } -#endif } // namespace ranger diff --git a/src/Forest.h b/src/Forest.h index a1275b43e..5e9817195 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -17,12 +17,10 @@ #include #include #include -#ifndef OLD_WIN_R_BUILD #include #include #include #include -#endif #include "globals.h" #include "Tree.h" @@ -176,11 +174,7 @@ class Forest { void setAlwaysSplitVariables(const std::vector& always_split_variable_names); // Show progress every few seconds -#ifdef OLD_WIN_R_BUILD - void showProgress(std::string operation, clock_t start_time, clock_t& lap_time); -#else void showProgress(std::string operation, size_t max_progress); -#endif // Verbose output stream, cout if verbose==true, logfile if not std::ostream* verbose_out; @@ -212,10 +206,8 @@ class Forest { // Multithreading uint num_threads; std::vector thread_ranges; -#ifndef OLD_WIN_R_BUILD std::mutex mutex; std::condition_variable condition_variable; -#endif std::vector> trees; std::unique_ptr data; diff --git a/src/globals.h b/src/globals.h index b794273be..310a6a539 100644 --- a/src/globals.h +++ b/src/globals.h @@ -18,15 +18,6 @@ namespace ranger { #define M_PI 3.14159265358979323846 #endif -// Old/new Win build -#ifdef WIN_R_BUILD - #if __cplusplus < 201103L - #define OLD_WIN_R_BUILD - #else - #define NEW_WIN_R_BUILD - #endif -#endif - typedef unsigned int uint; // Tree types, probability is not selected by ID From 6e480e9a9433c3beaa99f3f89756ecb58a2b3202 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 10:03:13 +0100 Subject: [PATCH 053/111] require C++14 to build on Windows --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 96e01f677..dba0e7700 
100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,6 +18,7 @@ Depends: R (>= 3.1) Suggests: survival, testthat +SystemRequirements: C++14 Encoding: UTF-8 RoxygenNote: 7.2.2 URL: https://github.com/imbs-hl/ranger From 1fbb790461b711fbac8e9984322120de99cd9b57 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 10:55:41 +0100 Subject: [PATCH 054/111] switch to C++17 to avoid R-devel NOTE --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index dba0e7700..0aa794c6a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ Depends: R (>= 3.1) Suggests: survival, testthat -SystemRequirements: C++14 +SystemRequirements: C++17 Encoding: UTF-8 RoxygenNote: 7.2.2 URL: https://github.com/imbs-hl/ranger From e939ade7f547f67d10e4b38741a195f3e8e94259 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 12:20:04 +0100 Subject: [PATCH 055/111] update NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 3f7a2e077..5bcf535ff 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # ranger 0.14.2 * Switch to C++14 standard +* Fix a bug with always.split.variables selecting the wrong variables # ranger 0.14.1 * URL fix for CRAN From e322f84f5d01deaafca894f91d2d72f1ecdd5221 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Mar 2023 16:44:34 +0100 Subject: [PATCH 056/111] fix NEWS update --- NEWS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS b/NEWS index d42d48b8a..697fe7c9f 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,8 @@ ##### Version 0.14.2 +* Switch to C++14 standard * Add min.bucket parameter to restrict terminal node size +* Fix a bug with always.split.variables selecting the wrong variables ##### Version 0.14.0 * Faster permutation variable importance for high dimensional data (thanks to Roman Hornung) From e9d28f81e0ca9956769181de5fe7a075c4562c1d Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 22 Mar 2023 21:43:10 +0100 Subject: [PATCH 
057/111] remove C++17 requirement --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0aa794c6a..09bb848be 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests Version: 0.14.2 -Date: 2023-03-03 +Date: 2023-03-22 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -18,7 +18,6 @@ Depends: R (>= 3.1) Suggests: survival, testthat -SystemRequirements: C++17 Encoding: UTF-8 RoxygenNote: 7.2.2 URL: https://github.com/imbs-hl/ranger From e86cfab098323ddc99f7d8bc4fffb2c784b4c340 Mon Sep 17 00:00:00 2001 From: "Stanley E. Lazic" Date: Thu, 30 Mar 2023 12:43:26 +0900 Subject: [PATCH 058/111] Update ranger.R --- R/ranger.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index b4d161226..06fa3a2da 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -61,9 +61,8 @@ ##' Note that the factors are only reordered once and not again in each split. ##' ##' The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. -##' It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. -##' See Nembrini et al. (2018) for details. -##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. +##' It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. See Nembrini et al. (2018) for details. +##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. 
It should not be used when making predictions since the feature permuation step reduces performance (a warning is raised when predicting on new data). ##' ##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. ##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. From 51b2404f8623619b59b18b39fb875d1ff1bb115c Mon Sep 17 00:00:00 2001 From: "Stanley E. Lazic" Date: Thu, 30 Mar 2023 12:47:13 +0900 Subject: [PATCH 059/111] note about impurity_corrected Updated details to warn about predictions with importance=impurity_corrected --- R/ranger.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ranger.R b/R/ranger.R index 06fa3a2da..b910497d7 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -62,7 +62,7 @@ ##' ##' The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. ##' It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. See Nembrini et al. (2018) for details. -##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. It should not be used when making predictions since the feature permuation step reduces performance (a warning is raised when predicting on new data). +##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. It should not be used when making predictions since the feature permutation step reduces performance (a warning is raised when predicting on new data). ##' ##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. 
##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. From b200cda1d2b0e4313c64a7b368dd0d05234760d0 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 31 Mar 2023 06:52:59 +0200 Subject: [PATCH 060/111] doc changes and Rd file --- R/ranger.R | 3 ++- man/ranger.Rd | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index b910497d7..54a18342f 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -62,7 +62,8 @@ ##' ##' The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. ##' It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. See Nembrini et al. (2018) for details. -##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. It should not be used when making predictions since the feature permutation step reduces performance (a warning is raised when predicting on new data). +##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. +##' We recommend not to use the 'impurity_corrected' importance when making predictions since the feature permutation step might reduce predictive performance (a warning is raised when predicting on new data). ##' ##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. ##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. 
diff --git a/man/ranger.Rd b/man/ranger.Rd index 2e41c79e4..63d6d395e 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -187,9 +187,9 @@ The use of 'order' is recommended, as it computationally fast and can handle an Note that the factors are only reordered once and not again in each split. The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. -It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. -See Nembrini et al. (2018) for details. -This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. +It is a modified version of the method by Sandri & Zuccolotto (2008), which is faster and more memory efficient. See Nembrini et al. (2018) for details. +This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. +We recommend not to use the 'impurity_corrected' importance when making predictions since the feature permutation step might reduce predictive performance (a warning is raised when predicting on new data). Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. 
From 38560f224df5391fecca41458ca7f055d212a749 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 31 Mar 2023 07:09:00 +0200 Subject: [PATCH 061/111] prepare new CRAN release --- DESCRIPTION | 4 ++-- NEWS | 2 +- NEWS.md | 4 ++++ cpp_version/src/version.h | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 09bb848be..35d9548e1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.14.2 -Date: 2023-03-22 +Version: 0.15.0 +Date: 2023-03-31 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS b/NEWS index 697fe7c9f..9f5f3f6c1 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,5 @@ -##### Version 0.14.2 +##### Version 0.15.0 * Switch to C++14 standard * Add min.bucket parameter to restrict terminal node size * Fix a bug with always.split.variables selecting the wrong variables diff --git a/NEWS.md b/NEWS.md index aedd0cb20..a182f5837 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ + +# ranger 0.15.0 +* New CRAN version + # ranger 0.14.2 * Switch to C++14 standard * Add min.bucket parameter to restrict terminal node size diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index d59bf779c..e346e8c9c 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.14.2" +#define RANGER_VERSION "0.15.0" #endif From 62bbe2007c9016700fc18f3be42395cf3bffa76f Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 3 Apr 2023 07:03:02 +0200 Subject: [PATCH 062/111] fix quantreg for factor columns --- DESCRIPTION | 4 ++-- R/predict.R | 2 +- cpp_version/src/version.h | 2 +- tests/testthat/test_quantreg.R | 10 ++++++++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION 
index 35d9548e1..66b49a458 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.15.0 -Date: 2023-03-31 +Version: 0.15.1 +Date: 2023-04-03 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/R/predict.R b/R/predict.R index 3d3948a8b..9dc0aee9e 100644 --- a/R/predict.R +++ b/R/predict.R @@ -164,7 +164,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, } ## Recode factors if forest grown 'order' mode - if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null))) { + if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null)) && !is.matrix(x)) { x <- mapply(function(xx, yy) { if(is.null(yy)) { xx diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index e346e8c9c..25370c118 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.15.0" +#define RANGER_VERSION "0.15.1" #endif diff --git a/tests/testthat/test_quantreg.R b/tests/testthat/test_quantreg.R index b7f4481e6..b2f6ffab6 100644 --- a/tests/testthat/test_quantreg.R +++ b/tests/testthat/test_quantreg.R @@ -50,3 +50,13 @@ test_that("User specified function works as expected", { pred_sample <- predict(rf.quant, mtcars[27:32, ], type = "quantiles", what = function(x) sample(x, 10, replace = TRUE)) expect_equal(dim(pred_sample$predictions), c(pred_sample$num.samples, 10)) }) + +test_that("Working for factor variables", { + expect_silent(rf <- ranger(Sepal.Length ~ ., iris, quantreg = TRUE)) + expect_silent(predict(rf, iris, type = "quantiles")) +}) + +test_that("Working for unordered factor variables", { + expect_silent(rf <- ranger(Sepal.Length ~ ., iris, quantreg = TRUE, respect.unordered.factors = 
"order")) + expect_silent(predict(rf, iris, type = "quantiles")) +}) From a795d409a72e447d5490b4038da26d494148290b Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Mon, 17 Apr 2023 11:29:32 +1000 Subject: [PATCH 063/111] Update predict.R --- R/predict.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/predict.R b/R/predict.R index 9dc0aee9e..ef1397e73 100644 --- a/R/predict.R +++ b/R/predict.R @@ -516,7 +516,7 @@ predict.ranger <- function(object, data = NULL, predict.all = FALSE, node.values <- object$random.node.values.oob } else { ## New data prediction - terminal.nodes <- predict(object, data, type = "terminalNodes")$predictions + 1 + terminal.nodes <- predict(object, data, num.threads = num.threads, type = "terminalNodes")$predictions + 1 node.values <- 0 * terminal.nodes for (tree in 1:num.trees) { node.values[, tree] <- object$random.node.values[terminal.nodes[, tree], tree] From 47d07b69641b8b2d9b1532fe8d8d09126092f78c Mon Sep 17 00:00:00 2001 From: olivroy <52606734+olivroy@users.noreply.github.com> Date: Wed, 28 Jun 2023 16:38:08 -0400 Subject: [PATCH 064/111] Add pkgdown link to DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 66b49a458..b8facaf3b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,5 +20,5 @@ Suggests: testthat Encoding: UTF-8 RoxygenNote: 7.2.2 -URL: https://github.com/imbs-hl/ranger +URL: http://imbs-hl.github.io/ranger/, https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues From 4ff16e9175695b039638f35cfe5b470006c37337 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Sun, 2 Jul 2023 00:04:33 +0200 Subject: [PATCH 065/111] fix memory error for always.split.variables with high mtry values --- DESCRIPTION | 6 +++--- NEWS.md | 6 ++++++ cpp_version/src/version.h | 2 +- src/Forest.cpp | 3 +++ src/utility.cpp | 2 +- tests/testthat/test_splitweights.R | 3 +++ 6 files changed, 17 insertions(+), 5 deletions(-) diff 
--git a/DESCRIPTION b/DESCRIPTION index 66b49a458..a3a258659 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.15.1 -Date: 2023-04-03 +Version: 0.15.2 +Date: 2023-07-02 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -19,6 +19,6 @@ Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.2.2 +RoxygenNote: 7.2.3 URL: https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/NEWS.md b/NEWS.md index a182f5837..3652517f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,10 @@ +# ranger 0.15.2 +* Fix memory error for always.split.variables in certain settings + +# ranger 0.15.1 +* Fix quantile regression for factor variables in "order" mode + # ranger 0.15.0 * New CRAN version diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 25370c118..d84865fdd 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.15.1" +#define RANGER_VERSION "0.15.2" #endif diff --git a/src/Forest.cpp b/src/Forest.cpp index dfde5c118..81f8baf4e 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -947,6 +947,9 @@ void Forest::setAlwaysSplitVariables(const std::vector& always_spli deterministic_varIDs.push_back(k + num_independent_variables); } } + + // Sort in reverse order for removing with erase later + std::sort(deterministic_varIDs.rbegin(), deterministic_varIDs.rend()); } void Forest::showProgress(std::string operation, size_t max_progress) { diff --git a/src/utility.cpp b/src/utility.cpp index d27b2fa23..9e7853b24 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -172,7 +172,7 @@ void drawWithoutReplacementFisherYates(std::vector& result, std::mt19937 // Skip indices for (size_t i = 0; i < skip.size(); ++i) 
{ - result.erase(result.begin() + skip[skip.size() - 1 - i]); + result.erase(result.begin() + skip[i]); } // Draw without replacement using Fisher Yates algorithm diff --git a/tests/testthat/test_splitweights.R b/tests/testthat/test_splitweights.R index caf3c2eba..0572de439 100644 --- a/tests/testthat/test_splitweights.R +++ b/tests/testthat/test_splitweights.R @@ -31,6 +31,8 @@ test_that("Tree-wise split select weights work", { test_that("always split variables work", { expect_silent(ranger(Species ~ ., iris, num.trees = 10, always.split.variables = c("Petal.Length", "Petal.Width"), mtry = 2)) + expect_silent(ranger(Species ~ ., iris, num.trees = 10, + always.split.variables = c("Petal.Width", "Petal.Length"), mtry = 2)) expect_silent(ranger(dependent.variable.name = "Species", data = iris, num.trees = 10, always.split.variables = c("Petal.Length", "Petal.Width"), mtry = 2)) }) @@ -57,3 +59,4 @@ test_that("always split variables respect split select weights", { split.select.weights=with_last_zero) ) }) + From e13b9e14c2fe7ab82eda5c73db24a02689cc792b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 19 Jul 2023 06:37:42 +0200 Subject: [PATCH 066/111] update pkgdown workflow --- .github/workflows/pkgdown.yaml | 62 +++++++++++++++++----------------- DESCRIPTION | 3 +- _pkgdown.yml | 27 ++------------- 3 files changed, 36 insertions(+), 56 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 1abece4b9..ed7650c73 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,48 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: - - main - - master + branches: [main, master] + pull_request: + branches: [main, master] + release: + types: [published] + workflow_dispatch: name: pkgdown jobs: pkgdown: - runs-on: macOS-latest + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-pandoc@v2 - - uses: r-lib/actions/setup-pandoc@v1 - - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true - - name: Restore R package cache - uses: actions/cache@v2 + - uses: r-lib/actions/setup-r-dependencies@v2 with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + extra-packages: any::pkgdown, local::. + needs: website - - name: Install dependencies - run: | - remotes::install_deps(dependencies = TRUE) - install.packages("pkgdown", type = "binary") + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} - - name: Install package - run: R CMD INSTALL . 
- - - name: Deploy package - run: | - git config --local user.email "actions@github.com" - git config --local user.name "GitHub Actions" - Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/DESCRIPTION b/DESCRIPTION index f92a82a97..9ecb61bf3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,5 +20,6 @@ Suggests: testthat Encoding: UTF-8 RoxygenNote: 7.2.3 -URL: http://imbs-hl.github.io/ranger/, https://github.com/imbs-hl/ranger +URL: http://imbs-hl.github.io/ranger/, + https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/_pkgdown.yml b/_pkgdown.yml index ef571c884..aaa296d3e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,25 +1,4 @@ -development: - mode: auto +url: http://imbs-hl.github.io/ranger/ +template: + bootstrap: 5 -url: https://imbs-hl.github.io/ranger - -home: - title: A fast implementation of Random Forests - description: > - A fast implementation of Random Forests, particularly suited for high - dimensional data. Ensembles of classification, regression, survival and - probability prediction trees are supported. Data from genome-wide association - studies can be analyzed efficiently. In addition to data frames, datasets of - class 'gwaa.data' (R package 'GenABEL') and 'dgCMatrix' (R package 'Matrix') - can be directly analyzed. - -authors: - Marvin N. 
Wright: - href: http://wrig.de - -news: - one_page: true - cran_dates: true - # releases: - # - text: "ranger 0.12.4" - # href: link From d1ecf1edcce2686de4b64a428a9a6f5e5b360084 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 19 Jul 2023 06:59:26 +0200 Subject: [PATCH 067/111] fix min bucket for C++ version --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ cpp_version/src/main.cpp | 2 +- cpp_version/src/utility/ArgumentHandler.cpp | 21 ++++++++++++++++++--- cpp_version/src/utility/ArgumentHandler.h | 1 + cpp_version/src/version.h | 2 +- 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9ecb61bf3..6ca004781 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.15.2 -Date: 2023-07-02 +Version: 0.15.3 +Date: 2023-07-19 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 3652517f6..22a8a8d64 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.15.3 +* Fix min bucket option in C++ version + # ranger 0.15.2 * Fix memory error for always.split.variables in certain settings diff --git a/cpp_version/src/main.cpp b/cpp_version/src/main.cpp index 73c2c8adf..b4b9d0f0d 100644 --- a/cpp_version/src/main.cpp +++ b/cpp_version/src/main.cpp @@ -52,7 +52,7 @@ void run_ranger(const ArgumentHandler& arg_handler, std::ostream& verbose_out) { // Call Ranger forest->initCpp(arg_handler.depvarname, arg_handler.memmode, arg_handler.file, arg_handler.mtry, arg_handler.outprefix, arg_handler.ntree, &verbose_out, arg_handler.seed, arg_handler.nthreads, - arg_handler.predict, arg_handler.impmeasure, arg_handler.targetpartitionsize, arg_handler.splitweights, + arg_handler.predict, arg_handler.impmeasure, arg_handler.targetpartitionsize, arg_handler.minbucket, 
arg_handler.splitweights, arg_handler.alwayssplitvars, arg_handler.statusvarname, arg_handler.replace, arg_handler.catvars, arg_handler.savemem, arg_handler.splitrule, arg_handler.caseweights, arg_handler.predall, arg_handler.fraction, arg_handler.alpha, arg_handler.minprop, arg_handler.holdout, arg_handler.predictiontype, diff --git a/cpp_version/src/utility/ArgumentHandler.cpp b/cpp_version/src/utility/ArgumentHandler.cpp index 761b23448..1da4743f4 100644 --- a/cpp_version/src/utility/ArgumentHandler.cpp +++ b/cpp_version/src/utility/ArgumentHandler.cpp @@ -23,7 +23,7 @@ ArgumentHandler::ArgumentHandler(int argc, char **argv) : caseweights(""), depvarname(""), fraction(0), holdout(false), memmode(MEM_DOUBLE), savemem(false), skipoob(false), predict( ""), predictiontype(DEFAULT_PREDICTIONTYPE), randomsplits(DEFAULT_NUM_RANDOM_SPLITS), splitweights(""), nthreads( DEFAULT_NUM_THREADS), predall(false), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), maxdepth( - DEFAULT_MAXDEPTH), file(""), impmeasure(DEFAULT_IMPORTANCE_MODE), targetpartitionsize(0), mtry(0), outprefix( + DEFAULT_MAXDEPTH), file(""), impmeasure(DEFAULT_IMPORTANCE_MODE), targetpartitionsize(0), minbucket(0), mtry(0), outprefix( "ranger_out"), probability(false), splitrule(DEFAULT_SPLITRULE), statusvarname(""), ntree(DEFAULT_NUM_TREE), replace( true), verbose(false), write(false), treetype(TREE_CLASSIFICATION), seed(0), usedepth(false) { this->argc = argc; @@ -33,7 +33,7 @@ ArgumentHandler::ArgumentHandler(int argc, char **argv) : int ArgumentHandler::processArguments() { // short options - char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:U:XZa:b:c:d:f:hi:j:kl:m:o:pr:s:t:uvwy:z:"; + char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:U:XZa:b:c:d:f:hi:j:kl:m:n:o:pr:s:t:uvwy:z:"; // long options: longname, no/optional/required argument?, flag(not used!), shortname const struct option long_options[] = { @@ -53,7 +53,6 @@ int ArgumentHandler::processArguments() { { "nthreads", required_argument, 0, 'U'}, { 
"predall", no_argument, 0, 'X'}, { "version", no_argument, 0, 'Z'}, - { "alpha", required_argument, 0, 'a'}, { "minprop", required_argument, 0, 'b'}, { "catvars", required_argument, 0, 'c'}, @@ -65,6 +64,7 @@ int ArgumentHandler::processArguments() { { "usedepth", no_argument, 0, 'k'}, { "targetpartitionsize", required_argument, 0, 'l'}, { "mtry", required_argument, 0, 'm'}, + { "minbucket", required_argument, 0, 'n'}, { "outprefix", required_argument, 0, 'o'}, { "probability", no_argument, 0, 'p'}, { "splitrule", required_argument, 0, 'r'}, @@ -306,6 +306,20 @@ int ArgumentHandler::processArguments() { } break; + case 'n': + try { + int temp = std::stoi(optarg); + if (temp < 1) { + throw std::runtime_error(""); + } else { + minbucket = temp; + } + } catch (...) { + throw std::runtime_error( + "Illegal argument for option 'minbucket'. Please give a positive integer. See '--help' for details."); + } + break; + case 'o': outprefix = optarg; break; @@ -573,6 +587,7 @@ void ArgumentHandler::displayHelp() { std::cout << " " << " (Default: sqrt(p) with p = number of independent variables)" << std::endl; std::cout << " " << "--targetpartitionsize N Set minimal node size to N." << std::endl; + std::cout << " " << "--minbucket N Set min bucket size to N." << std::endl; std::cout << " " << " For Classification and Regression growing is stopped if a node reaches a size smaller than N." 
<< std::endl; diff --git a/cpp_version/src/utility/ArgumentHandler.h b/cpp_version/src/utility/ArgumentHandler.h index 00d4b64a4..d6964093f 100644 --- a/cpp_version/src/utility/ArgumentHandler.h +++ b/cpp_version/src/utility/ArgumentHandler.h @@ -71,6 +71,7 @@ class ArgumentHandler { std::string file; ImportanceMode impmeasure; uint targetpartitionsize; + uint minbucket; uint mtry; std::string outprefix; bool probability; diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index d84865fdd..781d7076a 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.15.2" +#define RANGER_VERSION "0.15.3" #endif From 9acb4ee80bdffd0e8560cc62e84476dfb5099ac3 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 19 Jul 2023 07:35:45 +0200 Subject: [PATCH 068/111] fix treeInfo for probability prediction when y is numeric --- R/treeInfo.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/treeInfo.R b/R/treeInfo.R index 467b7eb7d..f0a54d362 100644 --- a/R/treeInfo.R +++ b/R/treeInfo.R @@ -128,8 +128,12 @@ treeInfo <- function(object, tree = 1) { } else if (forest$treetype == "Probability estimation") { predictions <- matrix(nrow = nrow(result), ncol = length(forest$class.values)) predictions[result$terminal, ] <- do.call(rbind, forest$terminal.class.counts[[tree]]) - colnames(predictions) <- forest$levels[forest$class.values] - predictions <- predictions[, forest$levels[sort(forest$class.values)], drop = FALSE] + if (!is.null(forest$levels)) { + colnames(predictions) <- forest$levels[forest$class.values] + predictions <- predictions[, forest$levels[sort(forest$class.values)], drop = FALSE] + } else { + colnames(predictions) <- forest$class.values + } colnames(predictions) <- paste0("pred.", colnames(predictions)) result <- data.frame(result, predictions) } else if (forest$treetype == "Survival") { From 1fb5791230fde341cb4f2eabe4d62a9a497b7cbf Mon Sep 17 
00:00:00 2001 From: Marvin Wright Date: Wed, 16 Aug 2023 06:52:17 +0200 Subject: [PATCH 069/111] update cmake version --- cpp_version/CMakeLists.txt | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/cpp_version/CMakeLists.txt b/cpp_version/CMakeLists.txt index c9bdd380d..604c2d51c 100644 --- a/cpp_version/CMakeLists.txt +++ b/cpp_version/CMakeLists.txt @@ -1,33 +1,23 @@ +cmake_minimum_required(VERSION 3.1) project(ranger) -cmake_minimum_required(VERSION 2.0) ## ======================================================================================## -## Check for C++14. For GCC this is >=4.7 +## Check for C++14 ## ======================================================================================## -include(CheckCXXCompilerFlag) -CHECK_CXX_COMPILER_FLAG("-std=c++14" COMPILER_SUPPORTS_CXX14) -if(COMPILER_SUPPORTS_CXX14) - message("Compiler with C++14 support found.") -else() - message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++14 support. Please use a different C++ compiler, i.e. gcc >= 5 or Clang >= 3.4.") -endif() +set (CMAKE_CXX_STANDARD 14) +set (CMAKE_CXX_STANDARD_REQUIRED TRUE) ## ======================================================================================## ## Compiler flags ## ======================================================================================## -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++14") +add_compile_options(-Wall) ## ======================================================================================## -## In Clang phtread flag only for compiler, not for linker. 
For -## windows use static linking +## Threads ## ======================================================================================## -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS -pthread) -elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lpthread -static") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +set(THREADS_PREFER_PTHREAD_FLAG TRUE) +find_package(Threads REQUIRED) ## ======================================================================================## ## Subdirectories and source files @@ -56,4 +46,4 @@ ADD_CUSTOM_TARGET(release ## Executable ## ======================================================================================## add_executable(ranger ${SOURCES}) - +target_link_libraries(ranger Threads::Threads) From 77f296a860bb8a96b3fb2325e618660a132395e9 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 16 Aug 2023 09:11:44 +0200 Subject: [PATCH 070/111] static linking in cross compile toolchain file --- cpp_version/cross_compile/toolchain_win32.cmake | 3 +++ cpp_version/cross_compile/toolchain_win64.cmake | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cpp_version/cross_compile/toolchain_win32.cmake b/cpp_version/cross_compile/toolchain_win32.cmake index c07342fd3..61731bf84 100644 --- a/cpp_version/cross_compile/toolchain_win32.cmake +++ b/cpp_version/cross_compile/toolchain_win32.cmake @@ -15,3 +15,6 @@ SET(CMAKE_FIND_ROOT_PATH /usr/i686-w64-mingw32) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# static linking +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static") diff --git a/cpp_version/cross_compile/toolchain_win64.cmake b/cpp_version/cross_compile/toolchain_win64.cmake index 535e5b716..4f0270e56 100644 --- a/cpp_version/cross_compile/toolchain_win64.cmake +++ 
b/cpp_version/cross_compile/toolchain_win64.cmake @@ -15,3 +15,6 @@ SET(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# static linking +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static") \ No newline at end of file From 777cc0b9a87d85931ac739c0820301ee47746641 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 16 Aug 2023 09:20:39 +0200 Subject: [PATCH 071/111] add cpp build action --- .github/workflows/cpp-build.yaml | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/cpp-build.yaml diff --git a/.github/workflows/cpp-build.yaml b/.github/workflows/cpp-build.yaml new file mode 100644 index 000000000..0f77b144b --- /dev/null +++ b/.github/workflows/cpp-build.yaml @@ -0,0 +1,39 @@ + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: CPP-build + +jobs: + linux: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build + run: | + sudo apt-get install cmake + mkdir build && pushd build + cmake -DCMAKE_INSTALL_PREFIX=~/.local ../cpp_version + make install -j2 && ctest -j2 --output-on-failure + macos: + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + - name: Build + run: | + mkdir build && pushd build + cmake -DCMAKE_INSTALL_PREFIX=~/.local ../cpp_version + make install -j2 && ctest --output-on-failure + windows: + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + - name: Build + shell: cmd + run: | + cmake -DCMAKE_INSTALL_PREFIX=%cd:\=/%/install cpp_version + cmake --build . 
--config Release --target install + ctest -C Release --output-on-failure \ No newline at end of file From 07a13084ea5d3a0db7493bc4cfdfeade93848c9a Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 16 Aug 2023 09:54:46 +0200 Subject: [PATCH 072/111] add help test --- .github/workflows/cpp-build.yaml | 14 +++++++------- cpp_version/CMakeLists.txt | 6 ++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cpp-build.yaml b/.github/workflows/cpp-build.yaml index 0f77b144b..e768a68f6 100644 --- a/.github/workflows/cpp-build.yaml +++ b/.github/workflows/cpp-build.yaml @@ -16,8 +16,8 @@ jobs: run: | sudo apt-get install cmake mkdir build && pushd build - cmake -DCMAKE_INSTALL_PREFIX=~/.local ../cpp_version - make install -j2 && ctest -j2 --output-on-failure + cmake ../cpp_version + make && ctest --output-on-failure macos: runs-on: macos-latest steps: @@ -25,8 +25,8 @@ jobs: - name: Build run: | mkdir build && pushd build - cmake -DCMAKE_INSTALL_PREFIX=~/.local ../cpp_version - make install -j2 && ctest --output-on-failure + cmake ../cpp_version + make && ctest --output-on-failure windows: runs-on: windows-latest steps: @@ -34,6 +34,6 @@ jobs: - name: Build shell: cmd run: | - cmake -DCMAKE_INSTALL_PREFIX=%cd:\=/%/install cpp_version - cmake --build . --config Release --target install - ctest -C Release --output-on-failure \ No newline at end of file + cmake cpp_version + cmake --build . 
+ ctest --output-on-failure \ No newline at end of file diff --git a/cpp_version/CMakeLists.txt b/cpp_version/CMakeLists.txt index 604c2d51c..6e222c091 100644 --- a/cpp_version/CMakeLists.txt +++ b/cpp_version/CMakeLists.txt @@ -47,3 +47,9 @@ ADD_CUSTOM_TARGET(release ## ======================================================================================## add_executable(ranger ${SOURCES}) target_link_libraries(ranger Threads::Threads) + +## ======================================================================================## +## Test +## ======================================================================================## +enable_testing() +add_test(NAME help COMMAND ranger --help) From a2720708f4867ea35f428e28e063e3d80cacae50 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 16 Aug 2023 09:58:47 +0200 Subject: [PATCH 073/111] fix windows test build --- .github/workflows/cpp-build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpp-build.yaml b/.github/workflows/cpp-build.yaml index e768a68f6..8a713a9b5 100644 --- a/.github/workflows/cpp-build.yaml +++ b/.github/workflows/cpp-build.yaml @@ -35,5 +35,5 @@ jobs: shell: cmd run: | cmake cpp_version - cmake --build . - ctest --output-on-failure \ No newline at end of file + cmake --build . 
--config Release + ctest -C Release --output-on-failure \ No newline at end of file From b18316eb32a2bbca313ae6b271d38e75afe86864 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 16 Aug 2023 11:14:17 +0200 Subject: [PATCH 074/111] remove windows cpp build for now --- .github/workflows/cpp-build.yaml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/cpp-build.yaml b/.github/workflows/cpp-build.yaml index 8a713a9b5..5a0962f4f 100644 --- a/.github/workflows/cpp-build.yaml +++ b/.github/workflows/cpp-build.yaml @@ -27,13 +27,3 @@ jobs: mkdir build && pushd build cmake ../cpp_version make && ctest --output-on-failure - windows: - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - - name: Build - shell: cmd - run: | - cmake cpp_version - cmake --build . --config Release - ctest -C Release --output-on-failure \ No newline at end of file From a252d353c9dabb9177ef580a0343685bae9021bb Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 12 Sep 2023 22:08:00 +0200 Subject: [PATCH 075/111] add node.stats option --- DESCRIPTION | 4 +-- NEWS.md | 3 +++ R/RcppExports.R | 4 +-- R/predict.R | 4 ++- R/ranger.R | 6 +++-- cpp_version/src/version.h | 2 +- man/ranger.Rd | 3 +++ src/Forest.cpp | 11 +++++---- src/Forest.h | 22 +++++++++++++++-- src/RcppExports.cpp | 9 ++++--- src/Tree.cpp | 15 +++++++++--- src/Tree.h | 14 ++++++++++- src/TreeClassification.cpp | 13 ++++++++-- src/TreeProbability.cpp | 21 +++++++++++++--- src/TreeProbability.h | 2 +- src/TreeRegression.cpp | 10 +++++++- src/TreeSurvival.cpp | 50 +++++++++++++++++++++++++++++--------- src/TreeSurvival.h | 2 +- src/rangerCpp.cpp | 16 ++++++++++-- 19 files changed, 164 insertions(+), 47 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6ca004781..20ef02c46 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.15.3 -Date: 2023-07-19 +Version: 0.15.4 +Date: 2023-09-12 Author: 
Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 22a8a8d64..0795b254e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.15.4 +* Add node.stats option to save node statistics of all nodes + # ranger 0.15.3 * Fix min bucket option in C++ version diff --git a/R/RcppExports.R b/R/RcppExports.R index 19cc8e8ac..1de8f9d4c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { - .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, 
minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) +rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats) { + .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats) } numSmaller <- function(values, reference) { diff --git a/R/predict.R b/R/predict.R index ef1397e73..93284617c 100644 --- a/R/predict.R +++ b/R/predict.R @@ -250,6 +250,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, 
regularization.factor <- c(0, 0) use.regularization.factor <- FALSE regularization.usedepth <- FALSE + node.stats <- FALSE ## Use sparse matrix if (inherits(x, "dgCMatrix")) { @@ -273,7 +274,8 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, - regularization.factor, use.regularization.factor, regularization.usedepth) + regularization.factor, use.regularization.factor, regularization.usedepth, + node.stats) if (length(result) == 0) { stop("User interrupt or internal error.") diff --git a/R/ranger.R b/R/ranger.R index 54a18342f..f888ab347 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -119,6 +119,7 @@ ##' @param num.threads Number of threads. Default is number of CPUs available. ##' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. ##' @param verbose Show computation status and estimated runtime. +##' @param node.stats Save node statistics. Set to \code{TRUE} to save prediction and number of observations for each node. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. ##' @param dependent.variable.name Name of dependent variable, needed if no formula given. For survival forests this is the time variable. ##' @param status.variable.name Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring. 
@@ -224,7 +225,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, keep.inbag = FALSE, inbag = NULL, holdout = FALSE, quantreg = FALSE, oob.error = TRUE, num.threads = NULL, save.memory = FALSE, - verbose = TRUE, seed = NULL, + verbose = TRUE, node.stats = FALSE, seed = NULL, dependent.variable.name = NULL, status.variable.name = NULL, classification = NULL, x = NULL, y = NULL, ...) { @@ -873,7 +874,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, - regularization.factor, use.regularization.factor, regularization.usedepth) + regularization.factor, use.regularization.factor, regularization.usedepth, + node.stats) if (length(result) == 0) { stop("User interrupt or internal error.") diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 781d7076a..2f82b8ee6 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.15.3" +#define RANGER_VERSION "0.15.4" #endif diff --git a/man/ranger.Rd b/man/ranger.Rd index 63d6d395e..3f47b7191 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -38,6 +38,7 @@ ranger( num.threads = NULL, save.memory = FALSE, verbose = TRUE, + node.stats = FALSE, seed = NULL, dependent.variable.name = NULL, status.variable.name = NULL, @@ -114,6 +115,8 @@ ranger( \item{verbose}{Show computation status and estimated runtime.} +\item{node.stats}{Save node statistics. Set to \code{TRUE} to save prediction and number of observations for each node.} + \item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.} \item{dependent.variable.name}{Name of dependent variable, needed if no formula given. 
For survival forests this is the time variable.} diff --git a/src/Forest.cpp b/src/Forest.cpp index 81f8baf4e..8c7a42422 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -81,7 +81,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, min_node_size, min_bucket, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, - false, max_depth, regularization_factor, regularization_usedepth); + false, max_depth, regularization_factor, regularization_usedepth, false); if (prediction_mode) { loadFromFile(load_forest_filename); @@ -140,7 +140,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::vector>& manual_inbag, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, const std::vector& regularization_factor, - bool regularization_usedepth) { + bool regularization_usedepth, bool node_stats) { this->verbose_out = verbose_out; @@ -148,7 +148,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, init(std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, min_bucket, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth, - regularization_factor, regularization_usedepth); + regularization_factor, regularization_usedepth, node_stats); // Set variables to be always considered for splitting if (!always_split_variable_names.empty()) { @@ -182,7 +182,7 @@ void Forest::init(std::unique_ptr input_data, uint mtry, 
std::string outpu bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, - uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth) { + uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, bool node_stats) { // Initialize data with memmode this->data = std::move(input_data); @@ -224,6 +224,7 @@ void Forest::init(std::unique_ptr input_data, uint mtry, std::string outpu this->max_depth = max_depth; this->regularization_factor = regularization_factor; this->regularization_usedepth = regularization_usedepth; + this->save_node_stats = node_stats; // Set number of samples and variables num_samples = data->getNumRows(); @@ -474,7 +475,7 @@ void Forest::grow() { trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, importance_mode, min_node_size, min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth, - &regularization_factor, regularization_usedepth, &split_varIDs_used); + &regularization_factor, regularization_usedepth, &split_varIDs_used, save_node_stats); } // Init variable importance diff --git a/src/Forest.h b/src/Forest.h index c44ab6558..73d782dcc 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -55,13 +55,15 @@ class Forest { std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth, - const std::vector& regularization_factor, bool regularization_usedepth); + const std::vector&
regularization_factor, bool regularization_usedepth, + bool node_stats); void init(std::unique_ptr input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, - bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); + bool order_snps, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth, + bool node_stats); virtual void initInternal() = 0; // Grow or predict @@ -142,6 +144,21 @@ class Forest { const std::vector>& getSnpOrder() const { return data->getSnpOrder(); } + + std::vector> getNumSamplesNodes() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getNumSamplesNodes()); + } + return result; + } + std::vector> getNodePredictions() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getNodePredictions()); + } + return result; + } protected: void grow(); @@ -202,6 +219,7 @@ class Forest { PredictionType prediction_type; uint num_random_splits; uint max_depth; + bool save_node_stats; // MAXSTAT splitrule double alpha; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 65b57caba..4cc76e598 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -13,8 +13,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // rangerCpp -Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& 
split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); -RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP 
regularization_usedepthSEXP) { +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, bool node_stats); +RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP 
sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP, SEXP node_statsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -65,7 +65,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::vector& >::type regularization_factor(regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type use_regularization_factor(use_regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type regularization_usedepth(regularization_usedepthSEXP); - rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)); + Rcpp::traits::input_parameter< bool >::type node_stats(node_statsSEXP); + rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, 
sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats)); return rcpp_result_gen; END_RCPP } @@ -96,7 +97,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 47}, + {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 48}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3}, {NULL, NULL, 0} diff --git a/src/Tree.cpp b/src/Tree.cpp index c6ef6303f..542e540a5 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -18,8 +18,9 @@ namespace ranger { Tree::Tree() : mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), min_bucket(0), deterministic_varIDs(0), split_select_weights(0), case_weights( - 0), manual_inbag(0), oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth( - false), split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( + 0), manual_inbag(0), oob_sampleIDs(0), save_node_stats(false), num_samples_nodes(0), node_predictions(0), + holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), + split_varIDs_used(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( true), sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( 0) { @@ -29,7 +30,8 @@ Tree::Tree(std::vector>& 
child_nodeIDs, std::vector& std::vector& split_values) : mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), min_bucket(0), deterministic_varIDs(0), split_select_weights(0), case_weights( 0), manual_inbag(0), split_varIDs(split_varIDs), split_values(split_values), child_nodeIDs(child_nodeIDs), oob_sampleIDs( - 0), holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used( + 0), save_node_stats(false), num_samples_nodes(0), node_predictions(0), + holdout(false), keep_inbag(false), data(0), regularization_factor(0), regularization_usedepth(false), split_varIDs_used( 0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction( 0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( DEFAULT_MINPROP), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID( @@ -41,12 +43,13 @@ void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std: bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, - bool regularization_usedepth, std::vector* split_varIDs_used) { + bool regularization_usedepth, std::vector* split_varIDs_used, bool save_node_stats) { this->data = data; this->mtry = mtry; this->num_samples = num_samples; this->memory_saving_splitting = memory_saving_splitting; + this->save_node_stats = save_node_stats; // Create root node, assign bootstrap sample and oob samples child_nodeIDs.push_back(std::vector()); @@ -384,6 +387,10 @@ void Tree::createEmptyNode() { child_nodeIDs[1].push_back(0); start_pos.push_back(0); end_pos.push_back(0); + + if (save_node_stats) { + num_samples_nodes.push_back(0); 
+ } createEmptyNodeInternal(); } diff --git a/src/Tree.h b/src/Tree.h index 3acbfa20f..3536ce683 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -41,7 +41,7 @@ class Tree { std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, bool regularization_usedepth, - std::vector* split_varIDs_used); + std::vector* split_varIDs_used, bool save_node_stats); virtual void allocateMemory() = 0; @@ -75,6 +75,13 @@ class Tree { const std::vector& getInbagCounts() const { return inbag_counts; } + + const std::vector& getNumSamplesNodes() const { + return num_samples_nodes; + } + const std::vector& getNodePredictions() const { + return node_predictions; + } protected: void createPossibleSplitVarSubset(std::vector& result); @@ -191,6 +198,11 @@ class Tree { // IDs of OOB individuals, sorted std::vector oob_sampleIDs; + + // Node statistics + bool save_node_stats; + std::vector num_samples_nodes; + std::vector node_predictions; // Holdout mode bool holdout; diff --git a/src/TreeClassification.cpp b/src/TreeClassification.cpp index 7353f47ab..23bd8acfe 100644 --- a/src/TreeClassification.cpp +++ b/src/TreeClassification.cpp @@ -76,8 +76,15 @@ void TreeClassification::appendToFileInternal(std::ofstream& file) { // #nocov s bool TreeClassification::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { - // Stop if maximum node size or depth reached size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; + + // Save node statistics + if (save_node_stats) { + num_samples_nodes[nodeID] = num_samples_node; + node_predictions[nodeID] = estimate(nodeID); + } + + // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { split_values[nodeID] = estimate(nodeID); return true; @@ -117,7 +124,9 @@ bool 
TreeClassification::splitNodeInternal(size_t nodeID, std::vector& p } void TreeClassification::createEmptyNodeInternal() { - // Empty on purpose + if (save_node_stats) { + node_predictions.push_back(0); + } } double TreeClassification::computePredictionAccuracyInternal(std::vector* prediction_error_casewise) { diff --git a/src/TreeProbability.cpp b/src/TreeProbability.cpp index 5aade041f..f86d99fb1 100644 --- a/src/TreeProbability.cpp +++ b/src/TreeProbability.cpp @@ -80,10 +80,19 @@ void TreeProbability::appendToFileInternal(std::ofstream& file) { // #nocov star bool TreeProbability::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { - // Stop if maximum node size or depth reached size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; - if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + + // Save node statistics + if (save_node_stats) { + num_samples_nodes[nodeID] = num_samples_node; addToTerminalNodes(nodeID); + } + + // Stop if maximum node size or depth reached + if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if (!save_node_stats) { + addToTerminalNodes(nodeID); + } return true; } @@ -100,7 +109,9 @@ bool TreeProbability::splitNodeInternal(size_t nodeID, std::vector& poss pure_value = value; } if (pure) { - addToTerminalNodes(nodeID); + if (!save_node_stats) { + addToTerminalNodes(nodeID); + } return true; } @@ -113,7 +124,9 @@ bool TreeProbability::splitNodeInternal(size_t nodeID, std::vector& poss } if (stop) { - addToTerminalNodes(nodeID); + if (!save_node_stats) { + addToTerminalNodes(nodeID); + } return true; } diff --git a/src/TreeProbability.h b/src/TreeProbability.h index 484a260f5..0bf9d9acf 100644 --- a/src/TreeProbability.h +++ b/src/TreeProbability.h @@ -105,7 +105,7 @@ class TreeProbability: public Tree { const std::vector* response_classIDs; const std::vector>* sampleIDs_per_class; - // Class 
counts in terminal nodes. Empty for non-terminal nodes. + // Class counts in terminal nodes. Empty for non-terminal nodes (except if save_node_stats). std::vector> terminal_class_counts; // Splitting weights diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 640395a6f..abb4120d2 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -60,6 +60,12 @@ void TreeRegression::appendToFileInternal(std::ofstream& file) { // #nocov start bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; + + // Save node statistics + if (save_node_stats) { + num_samples_nodes[nodeID] = num_samples_node; + node_predictions[nodeID] = estimate(nodeID); + } // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { @@ -105,7 +111,9 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi } void TreeRegression::createEmptyNodeInternal() { - // Empty on purpose + if (save_node_stats) { + node_predictions.push_back(0); + } } double TreeRegression::computePredictionAccuracyInternal(std::vector* prediction_error_casewise) { diff --git a/src/TreeSurvival.cpp b/src/TreeSurvival.cpp index d31d989ce..678522b98 100644 --- a/src/TreeSurvival.cpp +++ b/src/TreeSurvival.cpp @@ -88,6 +88,14 @@ double TreeSurvival::computePredictionAccuracyInternal(std::vector* pred bool TreeSurvival::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { + // Save node statistics + if (save_node_stats) { + size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; + num_samples_nodes[nodeID] = num_samples_node; + computeDeathCounts(nodeID); + computeSurvival(nodeID); + } + // Stop if node is pure bool pure = true; double pure_time = 0; @@ -104,8 +112,10 @@ bool TreeSurvival::splitNodeInternal(size_t nodeID, std::vector& possibl pure_status = status; } if 
(pure) { - computeDeathCounts(nodeID); - computeSurvival(nodeID); + if (!save_node_stats) { + computeDeathCounts(nodeID); + computeSurvival(nodeID); + } return true; } @@ -125,11 +135,15 @@ bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_sp size_t best_varID = 0; double best_value = 0; - computeDeathCounts(nodeID); + if (!save_node_stats) { + computeDeathCounts(nodeID); + } // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { - computeSurvival(nodeID); + if (!save_node_stats) { + computeSurvival(nodeID); + } return true; } @@ -155,7 +169,9 @@ bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_sp // Stop and save CHF if no good split found (this is terminal node). if (best_decrease < 0) { - computeSurvival(nodeID); + if (!save_node_stats) { + computeSurvival(nodeID); + } return true; } else { // If not terminal node save best values @@ -180,8 +196,10 @@ bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& poss // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { - computeDeathCounts(nodeID); - computeSurvival(nodeID); + if (!save_node_stats) { + computeDeathCounts(nodeID); + computeSurvival(nodeID); + } return true; } @@ -281,8 +299,10 @@ bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& poss // Stop and save CHF if no good split found (this is terminal node). 
if (adjusted_best_pvalue > alpha) { - computeDeathCounts(nodeID); - computeSurvival(nodeID); + if (!save_node_stats) { + computeDeathCounts(nodeID); + computeSurvival(nodeID); + } return true; } else { // If not terminal node save best values @@ -676,11 +696,15 @@ bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& p size_t best_varID = 0; double best_value = 0; - computeDeathCounts(nodeID); + if (!save_node_stats) { + computeDeathCounts(nodeID); + } // Stop if maximum node size or depth reached if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { - computeSurvival(nodeID); + if (!save_node_stats) { + computeSurvival(nodeID); + } return true; } @@ -702,7 +726,9 @@ bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& p // Stop and save CHF if no good split found (this is terminal node). if (best_decrease < 0) { - computeSurvival(nodeID); + if (!save_node_stats) { + computeSurvival(nodeID); + } return true; } else { // If not terminal node save best values diff --git a/src/TreeSurvival.h b/src/TreeSurvival.h index 61ce194e9..f27cef3c8 100644 --- a/src/TreeSurvival.h +++ b/src/TreeSurvival.h @@ -102,7 +102,7 @@ class TreeSurvival: public Tree { size_t num_timepoints; const std::vector* response_timepointIDs; - // For all terminal nodes CHF for all unique timepoints. For other nodes empty vector. + // For all terminal nodes CHF for all unique timepoints. For other nodes empty vector (except if save_node_stats). 
std::vector> chf; // Fields to save to while tree growing diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index e743ca151..538ed5c67 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -61,7 +61,8 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, - std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth) { + std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, + bool node_stats) { Rcpp::List result; @@ -152,7 +153,7 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM importance_mode, min_node_size, min_bucket, split_select_weights, always_split_variable_names, prediction_mode, sample_with_replacement, unordered_variable_names, save_memory, splitrule, case_weights, inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, - order_snps, max_depth, regularization_factor, regularization_usedepth); + order_snps, max_depth, regularization_factor, regularization_usedepth, node_stats); // Load forest object if in prediction mode if (prediction_mode) { @@ -247,6 +248,10 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM forest_object.push_back(forest->getSplitVarIDs(), "split.varIDs"); forest_object.push_back(forest->getSplitValues(), "split.values"); forest_object.push_back(forest->getIsOrderedVariable(), "is.ordered"); + + if (node_stats) { + forest_object.push_back(forest->getNumSamplesNodes(), "num.samples.nodes"); + } if (snp_data.nrow() > 1 && order_snps) { // Exclude permuted SNPs (if any) @@ -257,6 +262,13 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM if (treetype == TREE_CLASSIFICATION) { auto& temp = dynamic_cast(*forest); 
forest_object.push_back(temp.getClassValues(), "class.values"); + if (node_stats) { + forest_object.push_back(forest->getNodePredictions(), "node.predictions"); + } + } else if (treetype == TREE_REGRESSION) { + if (node_stats) { + forest_object.push_back(forest->getNodePredictions(), "node.predictions"); + } } else if (treetype == TREE_PROBABILITY) { auto& temp = dynamic_cast(*forest); forest_object.push_back(temp.getClassValues(), "class.values"); From 2854bb5d6c605c5cac793f49bf1a1ee48584dff1 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 12 Sep 2023 22:08:12 +0200 Subject: [PATCH 076/111] add tests for nodestats option --- tests/testthat/test_nodestats.R | 107 ++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/testthat/test_nodestats.R diff --git a/tests/testthat/test_nodestats.R b/tests/testthat/test_nodestats.R new file mode 100644 index 000000000..3470b9271 --- /dev/null +++ b/tests/testthat/test_nodestats.R @@ -0,0 +1,107 @@ +## Tests for node statistics + +library(ranger) +library(survival) +context("ranger_nodestats") + +test_that("if node.stats FALSE, no nodestats saved, classification", { + rf <- ranger(Species ~ ., iris, num.trees = 5) + expect_null(rf$forest$num.samples.nodes) + expect_null(rf$forest$node.predictions) +}) + +test_that("if node.stats FALSE, no nodestats saved, probability", { + rf <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE) + expect_null(rf$forest$num.samples.nodes) + expect_null(rf$forest$node.predictions) + expect_length(rf$forest$terminal.class.counts[[1]][[1]], 0) +}) + +test_that("if node.stats FALSE, no nodestats saved, regression", { + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5) + expect_null(rf$forest$num.samples.nodes) + expect_null(rf$forest$node.predictions) +}) + +test_that("if node.stats FALSE, no nodestats saved, survival", { + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) + expect_null(rf$forest$num.samples.nodes) + 
expect_null(rf$forest$node.predictions) + expect_length(rf$forest$chf[[1]][[1]], 0) +}) + +test_that("if node.stats TRUE, nodestats saved, classification", { + rf <- ranger(Species ~ ., iris, num.trees = 5, node.stats = TRUE) + + expect_is(rf$forest$num.samples.nodes, "list") + expect_length(rf$forest$num.samples.nodes, rf$num.trees) + expect_equal(rf$forest$num.samples.nodes[[1]][1], nrow(iris)) + + expect_is(rf$forest$node.predictions, "list") + expect_length(rf$forest$node.predictions, rf$num.trees) + expect_is(rf$forest$node.predictions[[1]], "numeric") +}) + +test_that("if node.stats TRUE, nodestats saved, probability", { + rf <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE, node.stats = TRUE) + + expect_is(rf$forest$num.samples.nodes, "list") + expect_length(rf$forest$num.samples.nodes, rf$num.trees) + expect_equal(rf$forest$num.samples.nodes[[1]][1], nrow(iris)) + + expect_null(rf$forest$node.predictions) + + expect_is(rf$forest$terminal.class.counts, "list") + expect_length(rf$forest$terminal.class.counts, rf$num.trees) + expect_length(rf$forest$terminal.class.counts[[1]][[1]], nlevels(iris$Species)) +}) + +test_that("if node.stats TRUE, nodestats saved, regression", { + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, node.stats = TRUE) + + expect_is(rf$forest$num.samples.nodes, "list") + expect_length(rf$forest$num.samples.nodes, rf$num.trees) + expect_equal(rf$forest$num.samples.nodes[[1]][1], nrow(iris)) + + expect_is(rf$forest$node.predictions, "list") + expect_length(rf$forest$node.predictions, rf$num.trees) + expect_is(rf$forest$node.predictions[[1]], "numeric") +}) + +test_that("if node.stats TRUE, nodestats saved, survival", { + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, node.stats = TRUE) + + expect_is(rf$forest$num.samples.nodes, "list") + expect_length(rf$forest$num.samples.nodes, rf$num.trees) + expect_equal(rf$forest$num.samples.nodes[[1]][1], nrow(veteran)) + + expect_null(rf$forest$node.predictions) + + 
expect_is(rf$forest$chf, "list") + expect_length(rf$forest$chf, rf$num.trees) + expect_is(rf$forest$chf[[1]], "list") + expect_is(rf$forest$chf[[1]][[1]], "numeric") + expect_length(rf$forest$chf[[1]][[1]], length(rf$unique.death.times)) +}) + + + +rf <- ranger(Species ~ ., iris, num.trees = 10, probability = TRUE, node.stats = TRUE) +rf$forest$num.samples.nodes +rf$forest$node.predictions +rf$forest$terminal.class.counts + + +rf <- ranger(Sepal.Length ~ ., iris, num.trees = 10, node.stats = TRUE) +rf$forest$num.samples.nodes +rf$forest$node.predictions + +# Survival + +rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 10, node.stats = TRUE) +rf$forest$num.samples.nodes +rf$forest$node.predictions +rf$forest$chf + + + From 7d370745818160af1a2849d381006d9f942a7c04 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 12 Sep 2023 22:27:17 +0200 Subject: [PATCH 077/111] add node stats to treeInfo() --- R/treeInfo.R | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/R/treeInfo.R b/R/treeInfo.R index f0a54d362..071399310 100644 --- a/R/treeInfo.R +++ b/R/treeInfo.R @@ -117,17 +117,35 @@ treeInfo <- function(object, tree = 1) { ## Prediction if (forest$treetype == "Classification") { - result$prediction <- forest$split.values[[tree]] - result$prediction[!result$terminal] <- NA + if (is.null(forest$num.samples.nodes)) { + # node.stats=FALSE + result$prediction <- forest$split.values[[tree]] + result$prediction[!result$terminal] <- NA + } else { + # node.stats=TRUE + result$prediction <- forest$node.predictions[[tree]] + } if (!is.null(forest$levels)) { result$prediction <- integer.to.factor(result$prediction, labels = forest$levels) } } else if (forest$treetype == "Regression") { - result$prediction <- forest$split.values[[tree]] - result$prediction[!result$terminal] <- NA + if (is.null(forest$num.samples.nodes)) { + # node.stats=FALSE + result$prediction <- forest$split.values[[tree]] + 
result$prediction[!result$terminal] <- NA + } else { + # node.stats=TRUE + result$prediction <- forest$node.predictions[[tree]] + } } else if (forest$treetype == "Probability estimation") { predictions <- matrix(nrow = nrow(result), ncol = length(forest$class.values)) - predictions[result$terminal, ] <- do.call(rbind, forest$terminal.class.counts[[tree]]) + if (is.null(forest$num.samples.nodes)) { + # node.stats=FALSE + predictions[result$terminal, ] <- do.call(rbind, forest$terminal.class.counts[[tree]]) + } else { + # node.stats=TRUE + predictions <- do.call(rbind, forest$terminal.class.counts[[tree]]) + } if (!is.null(forest$levels)) { colnames(predictions) <- forest$levels[forest$class.values] predictions <- predictions[, forest$levels[sort(forest$class.values)], drop = FALSE] @@ -142,5 +160,10 @@ treeInfo <- function(object, tree = 1) { stop("Error: Unknown tree type.") } + ## Node statistics + if (!is.null(forest$num.samples.nodes)) { + result$numSamples <- forest$num.samples.nodes[[tree]] + } + result } From 53f9c924c9d543297c6ac2693bc7a23886b76307 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 15 Sep 2023 14:34:03 +0200 Subject: [PATCH 078/111] add horizontal shrinkage --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 1 + R/RcppExports.R | 12 +++++ R/hshrink.R | 90 +++++++++++++++++++++++++++++++++++ man/hshrink.Rd | 29 +++++++++++ src/RcppExports.cpp | 51 ++++++++++++++++++++ src/utilityRcpp.cpp | 63 ++++++++++++++++++++++++ tests/testthat/test_hshrink.R | 68 ++++++++++++++++++++++++++ 9 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 R/hshrink.R create mode 100644 man/hshrink.Rd create mode 100644 tests/testthat/test_hshrink.R diff --git a/DESCRIPTION b/DESCRIPTION index 20ef02c46..dc27d2a2a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests Version: 0.15.4 -Date: 2023-09-12 +Date: 2023-09-15 Author: Marvin N. 
Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NAMESPACE b/NAMESPACE index 3aad2ad21..64d06c481 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(csrf) export(deforest) export(getTerminalNodeIDs) export(holdoutRF) +export(hshrink) export(importance) export(importance_pvalues) export(predictions) diff --git a/NEWS.md b/NEWS.md index 0795b254e..402c6f71b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # ranger 0.15.4 * Add node.stats option to save node statistics of all nodes +* Add horizontal shrinkage # ranger 0.15.3 * Fix min bucket option in C++ version diff --git a/R/RcppExports.R b/R/RcppExports.R index 1de8f9d4c..3d8790001 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,3 +13,15 @@ randomObsNode <- function(groups, y, inbag_counts) { .Call(`_ranger_randomObsNode`, groups, y, inbag_counts) } +hshrink_regr <- function(left_children, right_children, num_samples_nodes, node_predictions, split_values, lambda, nodeID, parent_n, parent_pred, cum_sum) { + invisible(.Call(`_ranger_hshrink_regr`, left_children, right_children, num_samples_nodes, node_predictions, split_values, lambda, nodeID, parent_n, parent_pred, cum_sum)) +} + +hshrink_prob <- function(left_children, right_children, num_samples_nodes, class_freq, lambda, nodeID, parent_n, parent_pred, cum_sum) { + invisible(.Call(`_ranger_hshrink_prob`, left_children, right_children, num_samples_nodes, class_freq, lambda, nodeID, parent_n, parent_pred, cum_sum)) +} + +replace_class_counts <- function(class_counts_old, class_counts_new) { + invisible(.Call(`_ranger_replace_class_counts`, class_counts_old, class_counts_new)) +} + diff --git a/R/hshrink.R b/R/hshrink.R new file mode 100644 index 000000000..02c46f701 --- /dev/null +++ b/R/hshrink.R @@ -0,0 +1,90 @@ +# ------------------------------------------------------------------------------- +# This 
file is part of Ranger. +# +# Ranger is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Ranger is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Ranger. If not, see . +# +# Written by: +# +# Marvin N. Wright +# Institut fuer Medizinische Biometrie und Statistik +# Universitaet zu Luebeck +# Ratzeburger Allee 160 +# 23562 Luebeck +# Germany +# +# http://www.imbs-luebeck.de +# ------------------------------------------------------------------------------- + + +#' Horizontal shrinkage +#' +#' Apply horizontal shrinkage to a ranger object. +#' Horizontal shrinkage is a regularization technique that recursively shrinks node predictions towards parent node predictions. +#' For details see Agarwal et al. (2022). +#' +#' @param rf ranger object, created with \code{node.stats = TRUE}. +#' @param lambda Non-negative shrinkage parameter. +#' +#' @return The ranger object is modified in-place. +#' +#' @examples +##' @references +##' \itemize{ +##' \item Agarwal, A., Tan, Y.S., Ronen, O., Singh, C. & Yu, B. (2022). Hierarchical Shrinkage: Improving the accuracy and interpretability of tree-based models. Proceedings of the 39th International Conference on Machine Learning, PMLR 162:111-135. +##' } +#' @author Marvin N. 
Wright +#' @export +hshrink <- function(rf, lambda) { + if (is.null(rf$forest$num.samples.nodes)) { + stop("Horizontal shrinkage needs node statistics, set node.stats=TRUE in ranger() call.") + } + if (lambda < 0) { + stop("Shrinkage parameter lambda has to be non-negative.") + } + + if (rf$treetype == "Regression") { + invisible(lapply(1:rf$num.trees, function(treeID) { + hshrink_regr( + rf$forest$child.nodeIDs[[treeID]][[1]], rf$forest$child.nodeIDs[[treeID]][[2]], + rf$forest$num.samples.nodes[[treeID]], rf$forest$node.predictions[[treeID]], + rf$forest$split.values[[treeID]], lambda, 0, 0, 0, 0 + ) + })) + } else if (rf$treetype == "Probability estimation") { + invisible(lapply(1:rf$num.trees, function(treeID) { + # Create temporary class frequency matrix + class_freq <- t(simplify2array(rf$forest$terminal.class.counts[[treeID]])) + + parent_pred <- rep(0, length(rf$forest$class.values)) + cum_sum <- rep(0, length(rf$forest$class.values)) + hshrink_prob( + rf$forest$child.nodeIDs[[treeID]][[1]], rf$forest$child.nodeIDs[[treeID]][[2]], + rf$forest$num.samples.nodes[[treeID]], class_freq, + lambda, 0, 0, parent_pred, cum_sum + ) + + # Assign temporary matrix values back to ranger object + replace_class_counts(rf$forest$terminal.class.counts[[treeID]], class_freq) + })) + } else if (rf$treetype == "Classification") { + stop("To apply horizontal shrinkage to classification forests, use probability=TRUE in the ranger() call.") + } else if (rf$treetype == "Survival") { + stop("Horizontal shrinkage not yet implemented for survival.") + } else { + stop("Unknown treetype.") + } + +} + + diff --git a/man/hshrink.Rd b/man/hshrink.Rd new file mode 100644 index 000000000..df57f1bc7 --- /dev/null +++ b/man/hshrink.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hshrink.R +\name{hshrink} +\alias{hshrink} +\title{Horizontal shrinkage} +\usage{ +hshrink(rf, lambda) +} +\arguments{ +\item{rf}{ranger object, created with 
\code{node.stats = TRUE}.} + +\item{lambda}{Non-negative shrinkage parameter.} +} +\value{ +The ranger object is modified in-place. +} +\description{ +Apply horizontal shrinkage to a ranger object. +Horizontal shrinkage is a regularization technique that recursively shrinks node predictions towards parent node predictions. +For details see Agarwal et al. (2022). +} +\references{ +\itemize{ + \item Agarwal, A., Tan, Y.S., Ronen, O., Singh, C. & Yu, B. (2022). Hierarchical Shrinkage: Improving the accuracy and interpretability of tree-based models. Proceedings of the 39th International Conference on Machine Learning, PMLR 162:111-135. + } +} +\author{ +Marvin N. Wright +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 4cc76e598..ead9a011b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -95,11 +95,62 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// hshrink_regr +void hshrink_regr(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right_children, Rcpp::IntegerVector& num_samples_nodes, Rcpp::NumericVector& node_predictions, Rcpp::NumericVector& split_values, double lambda, size_t nodeID, size_t parent_n, double parent_pred, double cum_sum); +RcppExport SEXP _ranger_hshrink_regr(SEXP left_childrenSEXP, SEXP right_childrenSEXP, SEXP num_samples_nodesSEXP, SEXP node_predictionsSEXP, SEXP split_valuesSEXP, SEXP lambdaSEXP, SEXP nodeIDSEXP, SEXP parent_nSEXP, SEXP parent_predSEXP, SEXP cum_sumSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::IntegerVector& >::type left_children(left_childrenSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector& >::type right_children(right_childrenSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector& >::type num_samples_nodes(num_samples_nodesSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericVector& >::type node_predictions(node_predictionsSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericVector& >::type split_values(split_valuesSEXP); + 
Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); + Rcpp::traits::input_parameter< size_t >::type nodeID(nodeIDSEXP); + Rcpp::traits::input_parameter< size_t >::type parent_n(parent_nSEXP); + Rcpp::traits::input_parameter< double >::type parent_pred(parent_predSEXP); + Rcpp::traits::input_parameter< double >::type cum_sum(cum_sumSEXP); + hshrink_regr(left_children, right_children, num_samples_nodes, node_predictions, split_values, lambda, nodeID, parent_n, parent_pred, cum_sum); + return R_NilValue; +END_RCPP +} +// hshrink_prob +void hshrink_prob(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right_children, Rcpp::IntegerVector& num_samples_nodes, Rcpp::NumericMatrix& class_freq, double lambda, size_t nodeID, size_t parent_n, Rcpp::NumericVector parent_pred, Rcpp::NumericVector cum_sum); +RcppExport SEXP _ranger_hshrink_prob(SEXP left_childrenSEXP, SEXP right_childrenSEXP, SEXP num_samples_nodesSEXP, SEXP class_freqSEXP, SEXP lambdaSEXP, SEXP nodeIDSEXP, SEXP parent_nSEXP, SEXP parent_predSEXP, SEXP cum_sumSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::IntegerVector& >::type left_children(left_childrenSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector& >::type right_children(right_childrenSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector& >::type num_samples_nodes(num_samples_nodesSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type class_freq(class_freqSEXP); + Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); + Rcpp::traits::input_parameter< size_t >::type nodeID(nodeIDSEXP); + Rcpp::traits::input_parameter< size_t >::type parent_n(parent_nSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericVector >::type parent_pred(parent_predSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericVector >::type cum_sum(cum_sumSEXP); + hshrink_prob(left_children, right_children, num_samples_nodes, class_freq, lambda, nodeID, parent_n, parent_pred, 
cum_sum); + return R_NilValue; +END_RCPP +} +// replace_class_counts +void replace_class_counts(Rcpp::List& class_counts_old, Rcpp::NumericMatrix& class_counts_new); +RcppExport SEXP _ranger_replace_class_counts(SEXP class_counts_oldSEXP, SEXP class_counts_newSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::List& >::type class_counts_old(class_counts_oldSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type class_counts_new(class_counts_newSEXP); + replace_class_counts(class_counts_old, class_counts_new); + return R_NilValue; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 48}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3}, + {"_ranger_hshrink_regr", (DL_FUNC) &_ranger_hshrink_regr, 10}, + {"_ranger_hshrink_prob", (DL_FUNC) &_ranger_hshrink_prob, 9}, + {"_ranger_replace_class_counts", (DL_FUNC) &_ranger_replace_class_counts, 2}, {NULL, NULL, 0} }; diff --git a/src/utilityRcpp.cpp b/src/utilityRcpp.cpp index dbad4a683..9866702df 100644 --- a/src/utilityRcpp.cpp +++ b/src/utilityRcpp.cpp @@ -89,3 +89,66 @@ Rcpp::NumericMatrix randomObsNode(Rcpp::IntegerMatrix groups, Rcpp::NumericVecto return result; } +// Recursive function for horizontal shrinkage (regression) +//[[Rcpp::export]] +void hshrink_regr(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right_children, + Rcpp::IntegerVector& num_samples_nodes, Rcpp::NumericVector& node_predictions, + Rcpp::NumericVector& split_values, double lambda, + size_t nodeID, size_t parent_n, double parent_pred, double cum_sum) { + if (nodeID == 0) { + // In the root, just use the prediction + cum_sum = node_predictions[nodeID]; + } else { + // If not root, use shrinkage formula + cum_sum += (node_predictions[nodeID] - parent_pred) / (1 + lambda/parent_n); + } + + if (left_children[nodeID] == 0) { + // If leaf, change node 
prediction in split_values (used for prediction) + split_values[nodeID] = cum_sum; + } else { + // If not leaf, give weighted prediction to child nodes + hshrink_regr(left_children, right_children, num_samples_nodes, node_predictions, split_values, + lambda, left_children[nodeID], num_samples_nodes[nodeID], node_predictions[nodeID], + cum_sum); + hshrink_regr(left_children, right_children, num_samples_nodes, node_predictions, split_values, + lambda, right_children[nodeID], num_samples_nodes[nodeID], node_predictions[nodeID], + cum_sum); + } +} + +// Recursive function for horizontal shrinkage (probability) +//[[Rcpp::export]] +void hshrink_prob(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right_children, + Rcpp::IntegerVector& num_samples_nodes, + Rcpp::NumericMatrix& class_freq, double lambda, + size_t nodeID, size_t parent_n, Rcpp::NumericVector parent_pred, Rcpp::NumericVector cum_sum) { + + if (nodeID == 0) { + // In the root, just use the prediction + cum_sum = class_freq(nodeID, Rcpp::_); + } else { + // If not root, use shrinkage formula + cum_sum += (class_freq(nodeID, Rcpp::_) - parent_pred) / (1 + lambda/parent_n); + } + + if (left_children[nodeID] == 0) { + // If leaf, overwrite this node's row of class_freq with the shrunken class frequencies + class_freq(nodeID, Rcpp::_) = cum_sum; + } else { + // If not leaf, recurse into both children with this node's class frequencies as parent prediction; cum_sum is cloned per child + hshrink_prob(left_children, right_children, num_samples_nodes, class_freq, lambda, + left_children[nodeID], num_samples_nodes[nodeID], class_freq(nodeID, Rcpp::_), clone(cum_sum)); + hshrink_prob(left_children, right_children, num_samples_nodes, class_freq, lambda, + right_children[nodeID], num_samples_nodes[nodeID], class_freq(nodeID, Rcpp::_), clone(cum_sum)); + } +} + +// Replace class counts list(vector) with values from matrix +//[[Rcpp::export]] +void replace_class_counts(Rcpp::List& class_counts_old, Rcpp::NumericMatrix& class_counts_new) { + for (size_t i = 0; i < class_counts_old.size(); ++i) { +
class_counts_old[i] = class_counts_new(i, Rcpp::_); + } +} + diff --git a/tests/testthat/test_hshrink.R b/tests/testthat/test_hshrink.R new file mode 100644 index 000000000..e6a43fcaa --- /dev/null +++ b/tests/testthat/test_hshrink.R @@ -0,0 +1,68 @@ +## Tests for hierarchical shrinkage + +library(ranger) +context("ranger_hshrink") + +## Tests +test_that("horizontal shrinkage gives an error when node.stats=FALSE", { + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 1, node.stats = FALSE) + expect_error(hshrink(rf, lambda = 5)) +}) + +test_that("horizontal shrinkage does not work for hard classification", { + rf <- ranger(Species ~ ., iris, num.trees = 1, node.stats = TRUE, probability = FALSE) + expect_error(hshrink(rf, lambda = 5)) +}) + +test_that("horizontal shrinkage with lambda=0 doesn't change leafs and prediction, regression", { + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 1, node.stats = TRUE) + split_values_before <- rf$forest$split.values[[1]] + pred_before <- predict(rf, iris)$predictions + hshrink(rf, lambda = 0) + split_values_after <- rf$forest$split.values[[1]] + pred_after <- predict(rf, iris)$predictions + expect_equal(split_values_before, split_values_after) + expect_equal(pred_before, pred_after) +}) + +test_that("horizontal shrinkage with lambda=0 doesn't change leafs and prediction, probability", { + rf <- ranger(Species ~ ., iris, num.trees = 1, node.stats = TRUE, probability = TRUE) + class_freq_before <- simplify2array(rf$forest$terminal.class.counts[[1]]) + pred_before <- predict(rf, iris)$predictions + hshrink(rf, lambda = 0) + class_freq_after <- simplify2array(rf$forest$terminal.class.counts[[1]]) + pred_after <- predict(rf, iris)$predictions + expect_equal(class_freq_before, class_freq_after) + expect_equal(pred_before, pred_after) +}) + +test_that("horizontal shrinkage with lambda>0 does change leafs and prediction, regression", { + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 1, replace = FALSE, sample.fraction = 1, 
node.stats = TRUE) + split_values_before <- rf$forest$split.values[[1]] + pred_before <- predict(rf, iris)$predictions + split_values_before[1] <- 0 # Modify to create deep copy + hshrink(rf, lambda = 100) + split_values_after <- rf$forest$split.values[[1]] + split_values_after[1] <- 0 # Also modify here + pred_after <- predict(rf, iris)$predictions + expect_false(all(split_values_before == split_values_after)) + + # Shrinkage reduces variance + expect_lt(var(pred_after), var(pred_before)) + +}) + +test_that("horizontal shrinkage with lambda>0 does change leafs and prediction, probability", { + rf <- ranger(Species ~ ., iris, num.trees = 1, node.stats = TRUE, probability = TRUE) + class_freq_before <- simplify2array(rf$forest$terminal.class.counts[[1]]) + pred_before <- predict(rf, iris)$predictions + hshrink(rf, lambda = 100) + class_freq_after <- simplify2array(rf$forest$terminal.class.counts[[1]]) + pred_after <- predict(rf, iris)$predictions + expect_false(all(class_freq_before == class_freq_after)) + + # Shrinkage reduces variance + expect_lt(var(pred_after[, 1]), var(pred_before[, 1])) + expect_lt(var(pred_after[, 2]), var(pred_before[, 2])) + expect_lt(var(pred_after[, 3]), var(pred_before[, 3])) +}) From 22967926cc3ed90418fff3f021893a8ed989fe4a Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 15 Sep 2023 14:37:45 +0200 Subject: [PATCH 079/111] hierarchical not horizontal --- NEWS.md | 2 +- R/hshrink.R | 12 ++++++------ man/hshrink.Rd | 6 +++--- src/utilityRcpp.cpp | 4 ++-- tests/testthat/test_hshrink.R | 12 ++++++------ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 402c6f71b..d18c08f71 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,7 @@ # ranger 0.15.4 * Add node.stats option to save node statistics of all nodes -* Add horizontal shrinkage +* Add hierarchical shrinkage # ranger 0.15.3 * Fix min bucket option in C++ version diff --git a/R/hshrink.R b/R/hshrink.R index 02c46f701..001866f4f 100644 --- 
a/R/hshrink.R +++ b/R/hshrink.R @@ -27,10 +27,10 @@ # ------------------------------------------------------------------------------- -#' Horizontal shrinkage +#' Hierarchical shrinkage #' -#' Apply horizontal shrinkage to a ranger object. -#' Horizontal shrinkage is a regularization technique that recursively shrinks node predictions towards parent node predictions. +#' Apply hierarchical shrinkage to a ranger object. +#' Hierarchical shrinkage is a regularization technique that recursively shrinks node predictions towards parent node predictions. #' For details see Agarwal et al. (2022). #' #' @param rf ranger object, created with \code{node.stats = TRUE}. @@ -47,7 +47,7 @@ #' @export hshrink <- function(rf, lambda) { if (is.null(rf$forest$num.samples.nodes)) { - stop("Horizontal shrinkage needs node statistics, set node.stats=TRUE in ranger() call.") + stop("Hierarchical shrinkage needs node statistics, set node.stats=TRUE in ranger() call.") } if (lambda < 0) { stop("Shrinkage parameter lambda has to be non-negative.") @@ -78,9 +78,9 @@ hshrink <- function(rf, lambda) { replace_class_counts(rf$forest$terminal.class.counts[[treeID]], class_freq) })) } else if (rf$treetype == "Classification") { - stop("To apply horizontal shrinkage to classification forests, use probability=TRUE in the ranger() call.") + stop("To apply hierarchical shrinkage to classification forests, use probability=TRUE in the ranger() call.") } else if (rf$treetype == "Survival") { - stop("Horizontal shrinkage not yet implemented for survival.") + stop("Hierarchical shrinkage not yet implemented for survival.") } else { stop("Unknown treetype.") } diff --git a/man/hshrink.Rd b/man/hshrink.Rd index df57f1bc7..e48c9a2e9 100644 --- a/man/hshrink.Rd +++ b/man/hshrink.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/hshrink.R \name{hshrink} \alias{hshrink} -\title{Horizontal shrinkage} +\title{Hierarchical shrinkage} \usage{ hshrink(rf, lambda) } @@ -15,8 +15,8 @@ hshrink(rf, lambda) The ranger 
object is modified in-place. } \description{ -Apply horizontal shrinkage to a ranger object. -Horizontal shrinkage is a regularization technique that recursively shrinks node predictions towards parent node predictions. +Apply hierarchical shrinkage to a ranger object. +Hierarchical shrinkage is a regularization technique that recursively shrinks node predictions towards parent node predictions. For details see Agarwal et al. (2022). } \references{ diff --git a/src/utilityRcpp.cpp b/src/utilityRcpp.cpp index 9866702df..799d57d8d 100644 --- a/src/utilityRcpp.cpp +++ b/src/utilityRcpp.cpp @@ -89,7 +89,7 @@ Rcpp::NumericMatrix randomObsNode(Rcpp::IntegerMatrix groups, Rcpp::NumericVecto return result; } -// Recursive function for horizontal shrinkage (regression) +// Recursive function for hierarchical shrinkage (regression) //[[Rcpp::export]] void hshrink_regr(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right_children, Rcpp::IntegerVector& num_samples_nodes, Rcpp::NumericVector& node_predictions, @@ -117,7 +117,7 @@ void hshrink_regr(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right } } -// Recursive function for horizontal shrinkage (probability) +// Recursive function for hierarchical shrinkage (probability) //[[Rcpp::export]] void hshrink_prob(Rcpp::IntegerVector& left_children, Rcpp::IntegerVector& right_children, Rcpp::IntegerVector& num_samples_nodes, diff --git a/tests/testthat/test_hshrink.R b/tests/testthat/test_hshrink.R index e6a43fcaa..94f2ddbc5 100644 --- a/tests/testthat/test_hshrink.R +++ b/tests/testthat/test_hshrink.R @@ -4,17 +4,17 @@ library(ranger) context("ranger_hshrink") ## Tests -test_that("horizontal shrinkage gives an error when node.stats=FALSE", { +test_that("hierarchical shrinkage gives an error when node.stats=FALSE", { rf <- ranger(Sepal.Length ~ ., iris, num.trees = 1, node.stats = FALSE) expect_error(hshrink(rf, lambda = 5)) }) -test_that("horizontal shrinkage does not work for hard classification", { 
+test_that("hierarchical shrinkage does not work for hard classification", { rf <- ranger(Species ~ ., iris, num.trees = 1, node.stats = TRUE, probability = FALSE) expect_error(hshrink(rf, lambda = 5)) }) -test_that("horizontal shrinkage with lambda=0 doesn't change leafs and prediction, regression", { +test_that("hierarchical shrinkage with lambda=0 doesn't change leafs and prediction, regression", { rf <- ranger(Sepal.Length ~ ., iris, num.trees = 1, node.stats = TRUE) split_values_before <- rf$forest$split.values[[1]] pred_before <- predict(rf, iris)$predictions @@ -25,7 +25,7 @@ test_that("horizontal shrinkage with lambda=0 doesn't change leafs and predictio expect_equal(pred_before, pred_after) }) -test_that("horizontal shrinkage with lambda=0 doesn't change leafs and prediction, probability", { +test_that("hierarchical shrinkage with lambda=0 doesn't change leafs and prediction, probability", { rf <- ranger(Species ~ ., iris, num.trees = 1, node.stats = TRUE, probability = TRUE) class_freq_before <- simplify2array(rf$forest$terminal.class.counts[[1]]) pred_before <- predict(rf, iris)$predictions @@ -36,7 +36,7 @@ test_that("horizontal shrinkage with lambda=0 doesn't change leafs and predictio expect_equal(pred_before, pred_after) }) -test_that("horizontal shrinkage with lambda>0 does change leafs and prediction, regression", { +test_that("hierarchical shrinkage with lambda>0 does change leafs and prediction, regression", { rf <- ranger(Sepal.Length ~ ., iris, num.trees = 1, replace = FALSE, sample.fraction = 1, node.stats = TRUE) split_values_before <- rf$forest$split.values[[1]] pred_before <- predict(rf, iris)$predictions @@ -52,7 +52,7 @@ test_that("horizontal shrinkage with lambda>0 does change leafs and prediction, }) -test_that("horizontal shrinkage with lambda>0 does change leafs and prediction, probability", { +test_that("hierarchical shrinkage with lambda>0 does change leafs and prediction, probability", { rf <- ranger(Species ~ ., iris, num.trees = 
1, node.stats = TRUE, probability = TRUE) class_freq_before <- simplify2array(rf$forest$terminal.class.counts[[1]]) pred_before <- predict(rf, iris)$predictions From d545c2793d5e3c7a0679272af1d985f1a3aa9e9b Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Mon, 23 Oct 2023 13:17:35 +0200 Subject: [PATCH 080/111] Add dependabot.yml to auto-update github actions --- .github/dependabot.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..28be3a994 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 + +updates: + # Keep dependencies for GitHub Actions up-to-date + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" From 00500dc70dacd39281e3ffbc9402bfc6b319cec5 Mon Sep 17 00:00:00 2001 From: talegari Date: Thu, 26 Oct 2023 23:41:18 +0530 Subject: [PATCH 081/111] added depvar to result --- R/ranger.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/ranger.R b/R/ranger.R index 54a18342f..a5a7aba90 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -146,6 +146,7 @@ ##' \item{\code{importance.mode}}{Importance mode used.} ##' \item{\code{num.samples}}{Number of samples.} ##' \item{\code{inbag.counts}}{Number of times the observations are in-bag in the trees.} +##' \item{\code{dependent.variable.name}}{Name of the dependent variable. 
This is NULL when x/y interface is used.} ##' @examples ##' ## Classification forest with default settings ##' ranger(Species ~ ., data = iris) @@ -276,6 +277,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: Invalid formula.") } data.selected <- parse.formula(formula, data, env = parent.frame()) + dependent.variable.name = names(data.selected)[1] y <- data.selected[, 1] x <- data.selected[, -1, drop = FALSE] } @@ -974,6 +976,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } } + ## slot: dependent.variable.name + ## will be NULL only when x/y interface is used + result$dependent.variable.name = dependent.variable.name + class(result) <- "ranger" ## Prepare quantile prediction From 90937806db8ff00d65f9a7af7bff0c555d6feecc Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Nov 2023 11:53:32 +0100 Subject: [PATCH 082/111] add time.interest to reduce unique time points --- DESCRIPTION | 4 +-- NEWS.md | 3 +++ R/RcppExports.R | 4 +-- R/predict.R | 5 +++- R/ranger.R | 27 +++++++++++++++++-- man/ranger.Rd | 3 +++ src/ForestSurvival.cpp | 61 +++++++++++++++++++++++++++--------------- src/ForestSurvival.h | 2 ++ src/RcppExports.cpp | 10 ++++--- src/rangerCpp.cpp | 12 ++++++++- 10 files changed, 98 insertions(+), 33 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6ca004781..66e395ac3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.15.3 -Date: 2023-07-19 +Version: 0.15.4 +Date: 2023-11-03 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS.md b/NEWS.md index 22a8a8d64..e07a5cc49 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.15.4 +* Add time.interest option to restrict unique survival times (faster and saves memory) + # ranger 0.15.3 * Fix min bucket option in C++ version diff --git a/R/RcppExports.R b/R/RcppExports.R index 19cc8e8ac..83507c1ed 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) { - .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, 
use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) +rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, time_interest, use_time_interest) { + .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, time_interest, use_time_interest) } numSmaller <- function(values, reference) { diff --git a/R/predict.R b/R/predict.R index ef1397e73..c53c0da2b 100644 --- a/R/predict.R +++ b/R/predict.R @@ -250,6 +250,8 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, regularization.factor <- c(0, 0) 
use.regularization.factor <- FALSE regularization.usedepth <- FALSE + time.interest <- c(0, 0) + use.time.interest <- FALSE ## Use sparse matrix if (inherits(x, "dgCMatrix")) { @@ -273,7 +275,8 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, - regularization.factor, use.regularization.factor, regularization.usedepth) + regularization.factor, use.regularization.factor, regularization.usedepth, + time.interest, use.time.interest) if (length(result) == 0) { stop("User interrupt or internal error.") diff --git a/R/ranger.R b/R/ranger.R index 54a18342f..bac1b6dbb 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -115,6 +115,7 @@ ##' @param inbag Manually set observations per tree. List of size num.trees, containing inbag counts for each observation. Can be used for stratified sampling. ##' @param holdout Hold-out mode. Hold-out all samples with case weight 0 and use these for variable importance and prediction error. ##' @param quantreg Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction. +##' @param time.interest Time points of interest (survival only). Can be \code{NULL} (default, use all observed time points), a vector of time points or a single number to use as many time points (grid over observed time points). ##' @param oob.error Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests. ##' @param num.threads Number of threads. Default is number of CPUs available. ##' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. 
@@ -222,7 +223,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, local.importance = FALSE, regularization.factor = 1, regularization.usedepth = FALSE, keep.inbag = FALSE, inbag = NULL, holdout = FALSE, - quantreg = FALSE, oob.error = TRUE, + quantreg = FALSE, time.interest = NULL, oob.error = TRUE, num.threads = NULL, save.memory = FALSE, verbose = TRUE, seed = NULL, dependent.variable.name = NULL, status.variable.name = NULL, @@ -822,6 +823,27 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } } + ## Time of interest + if (is.null(time.interest)) { + time.interest <- c(0, 0) + use.time.interest <- FALSE + } else { + use.time.interest <- TRUE + if (treetype != 5) { + stop("Error: time.interest only applicable to survival forests.") + } + if (is.numeric(time.interest) & length(time.interest) == 1) { + if (time.interest < 1) { + stop("Error: time.interest must be a positive integer.") + } + # Grid over observed time points + time <- sort(unique(y[, 1])) + time.interest <- time[unique(round(seq.int(1, length(time), length.out = time.interest)))] + } else { + time.interest <- sort(unique(time.interest)) + } + } + ## Prediction mode always false. Use predict.ranger() method. 
prediction.mode <- FALSE predict.all <- FALSE @@ -873,7 +895,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, - regularization.factor, use.regularization.factor, regularization.usedepth) + regularization.factor, use.regularization.factor, regularization.usedepth, + time.interest, use.time.interest) if (length(result) == 0) { stop("User interrupt or internal error.") diff --git a/man/ranger.Rd b/man/ranger.Rd index 63d6d395e..ff202a074 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -34,6 +34,7 @@ ranger( inbag = NULL, holdout = FALSE, quantreg = FALSE, + time.interest = NULL, oob.error = TRUE, num.threads = NULL, save.memory = FALSE, @@ -106,6 +107,8 @@ ranger( \item{quantreg}{Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction.} +\item{time.interest}{Time points of interest (survival only). Can be \code{NULL} (default, use all observed time points), a vector of time points or a single number to use as many time points (grid over observed time points).} + \item{oob.error}{Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests.} \item{num.threads}{Number of threads. 
Default is number of CPUs available.} diff --git a/src/ForestSurvival.cpp b/src/ForestSurvival.cpp index 0b20dff89..19e766aa4 100644 --- a/src/ForestSurvival.cpp +++ b/src/ForestSurvival.cpp @@ -42,6 +42,39 @@ void ForestSurvival::loadForest(size_t num_trees, std::vector& time_interest) { + + if (time_interest.empty()) { + // Use all observed unique time points + std::set unique_timepoint_set; + for (size_t i = 0; i < num_samples; ++i) { + unique_timepoint_set.insert(data->get_y(i, 0)); + } + unique_timepoints.reserve(unique_timepoint_set.size()); + for (auto& t : unique_timepoint_set) { + unique_timepoints.push_back(t); + } + } else { + // Use the supplied time points of interest + unique_timepoints = time_interest; + } + + // Create response_timepointIDs + for (size_t i = 0; i < num_samples; ++i) { + double value = data->get_y(i, 0); + + // Use the ID of the last time point not larger than value (ID 0 if value is not larger than the first time point); std::upper_bound assumes unique_timepoints is sorted ascending. + uint timepointID = 0; + if (value > unique_timepoints[0]) { + timepointID = std::upper_bound(unique_timepoints.begin(), unique_timepoints.end(), value) - 1 - unique_timepoints.begin(); + } + if (timepointID < 0) { // NOTE(review): timepointID is declared uint, so this check looks unreachable - confirm + timepointID = 0; + } + response_timepointIDs.push_back(timepointID); + } +} + std::vector>> ForestSurvival::getChf() const { std::vector>> result; result.reserve(num_trees); @@ -70,27 +103,6 @@ void ForestSurvival::initInternal() { min_bucket = DEFAULT_MIN_BUCKET_SURVIVAL; } - // Create unique timepoints - if (!prediction_mode) { - std::set unique_timepoint_set; - for (size_t i = 0; i < num_samples; ++i) { - unique_timepoint_set.insert(data->get_y(i, 0)); - } - unique_timepoints.reserve(unique_timepoint_set.size()); - for (auto& t : unique_timepoint_set) { - unique_timepoints.push_back(t); - } - - // Create response_timepointIDs - for (size_t i = 0; i < num_samples; ++i) { - double value = data->get_y(i, 0); - - // If timepoint is already in unique_timepoints, use ID. Else create a new one. 
- uint timepointID = find(unique_timepoints.begin(), unique_timepoints.end(), value) - unique_timepoints.begin(); - response_timepointIDs.push_back(timepointID); - } - } - // Sort data if extratrees and not memory saving mode if (splitrule == EXTRATREES && !memory_saving_splitting) { data->sort(); @@ -98,6 +110,13 @@ void ForestSurvival::initInternal() { } void ForestSurvival::growInternal() { + + // If unique time points not set, use observed times + if (unique_timepoints.empty()) { + setUniqueTimepoints(std::vector()); + } + + trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { trees.push_back(std::make_unique(&unique_timepoints, &response_timepointIDs)); diff --git a/src/ForestSurvival.h b/src/ForestSurvival.h index 15b8a87fe..d2efe4a82 100644 --- a/src/ForestSurvival.h +++ b/src/ForestSurvival.h @@ -34,6 +34,8 @@ class ForestSurvival: public Forest { std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector> >& forest_chf, std::vector& unique_timepoints, std::vector& is_ordered_variable); + + void setUniqueTimepoints(const std::vector& time_interest); std::vector>> getChf() const; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 65b57caba..97900517f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -13,8 +13,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // rangerCpp -Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool 
save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth); -RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP) { +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& 
split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, std::vector& time_interest, bool use_time_interest); +RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP 
regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP, SEXP time_interestSEXP, SEXP use_time_interestSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -65,7 +65,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::vector& >::type regularization_factor(regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type use_regularization_factor(use_regularization_factorSEXP); Rcpp::traits::input_parameter< bool >::type regularization_usedepth(regularization_usedepthSEXP); - rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)); + Rcpp::traits::input_parameter< std::vector& >::type time_interest(time_interestSEXP); + Rcpp::traits::input_parameter< bool >::type use_time_interest(use_time_interestSEXP); + rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, 
class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, time_interest, use_time_interest)); return rcpp_result_gen; END_RCPP } @@ -96,7 +98,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 47}, + {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 49}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {"_ranger_randomObsNode", (DL_FUNC) &_ranger_randomObsNode, 3}, {NULL, NULL, 0} diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index e743ca151..a2baee9af 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -61,7 +61,8 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, - std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth) { + std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, + std::vector& time_interest, bool use_time_interest) { Rcpp::List result; @@ -88,6 +89,9 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM if (!use_regularization_factor) { regularization_factor.clear(); } + if (!use_time_interest) { + time_interest.clear(); + } std::ostream* verbose_out; if (verbose) { @@ -191,6 +195,12 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM auto& temp = dynamic_cast(*forest); temp.setClassWeights(class_weights); } + + // Set time points of interest + if (treetype == TREE_SURVIVAL && !time_interest.empty()) { + auto& temp = dynamic_cast(*forest); + temp.setUniqueTimepoints(time_interest); + } } // Run Ranger From 
b2232fc7d69cb12dbc71cc34b51aabd68d1eefd0 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 3 Nov 2023 12:47:39 +0100 Subject: [PATCH 083/111] add tests for time.interest --- R/ranger.R | 6 +++- tests/testthat/test_survival.R | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/R/ranger.R b/R/ranger.R index bac1b6dbb..99039df90 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -838,7 +838,11 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } # Grid over observed time points time <- sort(unique(y[, 1])) - time.interest <- time[unique(round(seq.int(1, length(time), length.out = time.interest)))] + if (length(time) <= time.interest) { + time.interest <- time + } else { + time.interest <- time[unique(round(seq.int(1, length(time), length.out = time.interest)))] + } } else { time.interest <- sort(unique(time.interest)) } diff --git a/tests/testthat/test_survival.R b/tests/testthat/test_survival.R index d0fb0aafe..e675cf605 100644 --- a/tests/testthat/test_survival.R +++ b/tests/testthat/test_survival.R @@ -124,3 +124,57 @@ test_that("Survival error for competing risk data", { expect_error(ranger(y = sobj, x = veteran[, 1:2], num.trees = 5), "Error: Competing risks not supported yet\\. 
Use status=1 for events and status=0 for censoring\\.") }) + +test_that("Right unique time points without time.interest", { + times <- sort(unique(veteran$time)) + + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) + expect_equal(timepoints(rf), times) + + rf <- ranger(y = Surv(veteran$time, veteran$status), x = veteran[, c(-3, -4)], num.trees = 5) + expect_equal(timepoints(rf), times) +}) + +test_that("time.interest results in the right number of time points", { + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, time.interest = 20) + expect_equal(length(timepoints(rf)), 20) + + rf <- ranger(y = Surv(veteran$time, veteran$status), x = veteran[, c(-3, -4)], + num.trees = 5, time.interest = 20) + expect_equal(length(timepoints(rf)), 20) + + rf <- ranger(y = cbind(veteran$time, veteran$status), x = veteran[, c(-3, -4)], + num.trees = 5, time.interest = 20) + expect_equal(length(timepoints(rf)), 20) + + rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", + data = veteran, num.trees = 5, time.interest = 20) + expect_equal(length(timepoints(rf)), 20) +}) + +test_that("time.interest results in the right time points", { + times <- c(20, 100, 200, 1000) + + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, time.interest = times) + expect_equal(timepoints(rf), times) + + rf <- ranger(y = Surv(veteran$time, veteran$status), x = veteran[, c(-3, -4)], + num.trees = 5, time.interest = times) + expect_equal(timepoints(rf), times) + + rf <- ranger(y = cbind(veteran$time, veteran$status), x = veteran[, c(-3, -4)], + num.trees = 5, time.interest = times) + expect_equal(timepoints(rf), times) + + rf <- ranger(dependent.variable.name = "time", status.variable.name = "status", + data = veteran, num.trees = 5, time.interest = times) + expect_equal(timepoints(rf), times) +}) + +test_that("If more unique time points requested then observed, use observed times", { + times <- sort(unique(veteran$time)) + rf <- ranger(Surv(time, 
status) ~ ., veteran, num.trees = 5, time.interest = 200) + expect_equal(timepoints(rf), times) +}) + + From c9057d092d791727eb77257eb74ca369f2a14a42 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 16:55:50 +0100 Subject: [PATCH 084/111] use only event times --- R/ranger.R | 3 ++- src/ForestSurvival.cpp | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index 99039df90..b9be605b8 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -837,7 +837,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: time.interest must be a positive integer.") } # Grid over observed time points - time <- sort(unique(y[, 1])) + nocens <- y[, 2] > 0 + time <- sort(unique(y[nocens, 1])) if (length(time) <= time.interest) { time.interest <- time } else { diff --git a/src/ForestSurvival.cpp b/src/ForestSurvival.cpp index 19e766aa4..3b88e3122 100644 --- a/src/ForestSurvival.cpp +++ b/src/ForestSurvival.cpp @@ -48,7 +48,9 @@ void ForestSurvival::setUniqueTimepoints(const std::vector& time_interes // Use all observed unique time points std::set unique_timepoint_set; for (size_t i = 0; i < num_samples; ++i) { - unique_timepoint_set.insert(data->get_y(i, 0)); + if (data->get_y(i, 1) > 0) { + unique_timepoint_set.insert(data->get_y(i, 0)); + } } unique_timepoints.reserve(unique_timepoint_set.size()); for (auto& t : unique_timepoint_set) { @@ -65,8 +67,10 @@ void ForestSurvival::setUniqueTimepoints(const std::vector& time_interes // If timepoint is already in unique_timepoints, use ID. Else create a new one. 
uint timepointID = 0; - if (value > unique_timepoints[0]) { - timepointID = std::upper_bound(unique_timepoints.begin(), unique_timepoints.end(), value) - 1 - unique_timepoints.begin(); + if (value > unique_timepoints[unique_timepoints.size() - 1]) { + timepointID = unique_timepoints.size() - 1; + } else if (value > unique_timepoints[0]) { + timepointID = std::lower_bound(unique_timepoints.begin(), unique_timepoints.end(), value) - unique_timepoints.begin(); } if (timepointID < 0) { timepointID = 0; From d3768b36aa7c010c6662c818c4d19c6777da814b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 17:00:18 +0100 Subject: [PATCH 085/111] fix tests --- tests/testthat/test_survival.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_survival.R b/tests/testthat/test_survival.R index e675cf605..8a91bcc52 100644 --- a/tests/testthat/test_survival.R +++ b/tests/testthat/test_survival.R @@ -126,7 +126,7 @@ test_that("Survival error for competing risk data", { }) test_that("Right unique time points without time.interest", { - times <- sort(unique(veteran$time)) + times <- sort(unique(veteran$time[veteran$status > 0])) rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) expect_equal(timepoints(rf), times) @@ -172,7 +172,7 @@ test_that("time.interest results in the right time points", { }) test_that("If more unique time points requested then observed, use observed times", { - times <- sort(unique(veteran$time)) + times <- sort(unique(veteran$time[veteran$status > 0])) rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, time.interest = 200) expect_equal(timepoints(rf), times) }) From d083f9bebd80a44798e17963762b60ebcf93dd3a Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 17:14:06 +0100 Subject: [PATCH 086/111] fix other test --- tests/testthat/test_survival.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_survival.R b/tests/testthat/test_survival.R 
index 8a91bcc52..6676d6e0b 100644 --- a/tests/testthat/test_survival.R +++ b/tests/testthat/test_survival.R @@ -57,7 +57,7 @@ test_that("predict works for single observations, survival", { ## Special tests for random forests for survival analysis test_that("unique death times in survival result is right", { - expect_equal(rg.surv$unique.death.times, sort(unique(veteran$time))) + expect_equal(rg.surv$unique.death.times, sort(unique(veteran$time[veteran$status > 0]))) }) test_that("C-index splitting works", { From 8faf91e0f4fd2547ebb0d45121f24a130a4f7d55 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 21:29:42 +0100 Subject: [PATCH 087/111] add split statistics per node --- R/ranger.R | 2 +- R/treeInfo.R | 6 +++++ man/ranger.Rd | 2 +- man/treeInfo.Rd | 2 ++ src/Forest.h | 7 ++++++ src/Tree.cpp | 1 + src/Tree.h | 4 ++++ src/TreeClassification.cpp | 10 +++++++++ src/TreeProbability.cpp | 10 +++++++++ src/TreeRegression.cpp | 20 +++++++++++++++++ src/TreeSurvival.cpp | 15 +++++++++++++ src/rangerCpp.cpp | 1 + tests/testthat/test_nodestats.R | 39 +++++++++++++++++---------------- 13 files changed, 98 insertions(+), 21 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index 9691dde0b..e2059282f 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -120,7 +120,7 @@ ##' @param num.threads Number of threads. Default is number of CPUs available. ##' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. ##' @param verbose Show computation status and estimated runtime. -##' @param node.stats Save node statistics. Set to \code{TRUE} to save prediction and number of observations for each node. +##' @param node.stats Save node statistics. Set to \code{TRUE} to save prediction, number of observations and split statistics for each node. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. 
Set to \code{0} to ignore the \code{R} seed. ##' @param dependent.variable.name Name of dependent variable, needed if no formula given. For survival forests this is the time variable. ##' @param status.variable.name Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring. diff --git a/R/treeInfo.R b/R/treeInfo.R index 071399310..184eabd8a 100644 --- a/R/treeInfo.R +++ b/R/treeInfo.R @@ -52,6 +52,8 @@ #' \code{splitval} \tab The splitting value. For numeric or ordinal variables, all values smaller or equal go to the left, larger values to the right. For unordered factor variables see above. \cr #' \code{terminal} \tab Logical, TRUE for terminal nodes. \cr #' \code{prediction} \tab One column with the predicted class (factor) for classification and the predicted numerical value for regression. One probability per class for probability estimation in several columns. Nothing for survival, refer to \code{object$forest$chf} for the CHF node predictions. \cr +#' \code{numSamples} \tab Number of samples in the node (only if ranger called with \code{node.stats = TRUE}). \cr +#' \code{splitStat} \tab Split statistics, i.e., value of the splitting criterion (only if ranger called with \code{node.stats = TRUE}). \cr #' } #' @examples #' rf <- ranger(Species ~ ., data = iris) @@ -164,6 +166,10 @@ treeInfo <- function(object, tree = 1) { if (!is.null(forest$num.samples.nodes)) { result$numSamples <- forest$num.samples.nodes[[tree]] } + if (!is.null(forest$split.stats)) { + result$splitStat <- forest$split.stats[[tree]] + result$splitStat[result$terminal] <- NA + } result } diff --git a/man/ranger.Rd b/man/ranger.Rd index e67b68105..da464a1bd 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -118,7 +118,7 @@ ranger( \item{verbose}{Show computation status and estimated runtime.} -\item{node.stats}{Save node statistics. 
Set to \code{TRUE} to save prediction and number of observations for each node.} +\item{node.stats}{Save node statistics. Set to \code{TRUE} to save prediction, number of observations and split statistics for each node.} \item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.} diff --git a/man/treeInfo.Rd b/man/treeInfo.Rd index 360330521..cdd125811 100644 --- a/man/treeInfo.Rd +++ b/man/treeInfo.Rd @@ -22,6 +22,8 @@ A data.frame with the columns \code{splitval} \tab The splitting value. For numeric or ordinal variables, all values smaller or equal go to the left, larger values to the right. For unordered factor variables see above. \cr \code{terminal} \tab Logical, TRUE for terminal nodes. \cr \code{prediction} \tab One column with the predicted class (factor) for classification and the predicted numerical value for regression. One probability per class for probability estimation in several columns. Nothing for survival, refer to \code{object$forest$chf} for the CHF node predictions. \cr + \code{numSamples} \tab Number of samples in the node (only if ranger called with \code{node.stats = TRUE}). \cr + \code{splitStat} \tab Split statistics, i.e., value of the splitting criterion (only if ranger called with \code{node.stats = TRUE}). 
\cr } } \description{ diff --git a/src/Forest.h b/src/Forest.h index 73d782dcc..5b2972023 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -159,6 +159,13 @@ class Forest { } return result; } + std::vector> getSplitStats() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getSplitStats()); + } + return result; + } protected: void grow(); diff --git a/src/Tree.cpp b/src/Tree.cpp index 542e540a5..57d3dfbdd 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -390,6 +390,7 @@ void Tree::createEmptyNode() { if (save_node_stats) { num_samples_nodes.push_back(0); + split_stats.push_back(0); } createEmptyNodeInternal(); diff --git a/src/Tree.h b/src/Tree.h index 3536ce683..101c300df 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -82,6 +82,9 @@ class Tree { const std::vector& getNodePredictions() const { return node_predictions; } + const std::vector& getSplitStats() const { + return split_stats; + } protected: void createPossibleSplitVarSubset(std::vector& result); @@ -203,6 +206,7 @@ class Tree { bool save_node_stats; std::vector num_samples_nodes; std::vector node_predictions; + std::vector split_stats; // Holdout mode bool holdout; diff --git a/src/TreeClassification.cpp b/src/TreeClassification.cpp index 23bd8acfe..bbb3b581e 100644 --- a/src/TreeClassification.cpp +++ b/src/TreeClassification.cpp @@ -205,6 +205,11 @@ bool TreeClassification::findBestSplit(size_t nodeID, std::vector& possi // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_decrease; + } // Compute gini index for this node and to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -564,6 +569,11 @@ bool TreeClassification::findBestSplitExtraTrees(size_t nodeID, std::vector& possible // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if 
(save_node_stats) { + split_stats[nodeID] = best_decrease; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -568,6 +573,11 @@ bool TreeProbability::findBestSplitExtraTrees(size_t nodeID, std::vector // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_decrease; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index abb4120d2..c272695be 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -184,6 +184,11 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_decrease; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -513,6 +518,11 @@ bool TreeRegression::findBestSplitMaxstat(size_t nodeID, std::vector& po // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_maxstat; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -561,6 +571,11 @@ bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector& // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = 
best_decrease; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -794,6 +809,11 @@ bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector& possi // Save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_decrease; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { diff --git a/src/TreeSurvival.cpp b/src/TreeSurvival.cpp index 678522b98..1c60ba8bf 100644 --- a/src/TreeSurvival.cpp +++ b/src/TreeSurvival.cpp @@ -177,6 +177,11 @@ bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_sp // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_decrease; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -308,6 +313,11 @@ bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& poss // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_maxstat; + } // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { @@ -734,6 +744,11 @@ bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& p // If not terminal node save best values split_varIDs[nodeID] = best_varID; split_values[nodeID] = best_value; + + // Save split statistics + if (save_node_stats) { + split_stats[nodeID] = best_decrease; + 
} // Compute decrease of impurity for this node and add to variable importance if needed if (importance_mode == IMP_GINI || importance_mode == IMP_GINI_CORRECTED) { diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index 757381b66..c8c4fed21 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -260,6 +260,7 @@ Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericM if (node_stats) { forest_object.push_back(forest->getNumSamplesNodes(), "num.samples.nodes"); + forest_object.push_back(forest->getSplitStats(), "split.stats"); } if (snp_data.nrow() > 1 && order_snps) { diff --git a/tests/testthat/test_nodestats.R b/tests/testthat/test_nodestats.R index 3470b9271..538563bbd 100644 --- a/tests/testthat/test_nodestats.R +++ b/tests/testthat/test_nodestats.R @@ -8,12 +8,14 @@ test_that("if node.stats FALSE, no nodestats saved, classification", { rf <- ranger(Species ~ ., iris, num.trees = 5) expect_null(rf$forest$num.samples.nodes) expect_null(rf$forest$node.predictions) + expect_null(rf$forest$split.stats) }) test_that("if node.stats FALSE, no nodestats saved, probability", { rf <- ranger(Species ~ ., iris, num.trees = 5, probability = TRUE) expect_null(rf$forest$num.samples.nodes) expect_null(rf$forest$node.predictions) + expect_null(rf$forest$split.stats) expect_length(rf$forest$terminal.class.counts[[1]][[1]], 0) }) @@ -21,12 +23,14 @@ test_that("if node.stats FALSE, no nodestats saved, regression", { rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5) expect_null(rf$forest$num.samples.nodes) expect_null(rf$forest$node.predictions) + expect_null(rf$forest$split.stats) }) test_that("if node.stats FALSE, no nodestats saved, survival", { rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) expect_null(rf$forest$num.samples.nodes) expect_null(rf$forest$node.predictions) + expect_null(rf$forest$split.stats) expect_length(rf$forest$chf[[1]][[1]], 0) }) @@ -40,6 +44,10 @@ test_that("if node.stats TRUE, nodestats saved, classification", { 
expect_is(rf$forest$node.predictions, "list") expect_length(rf$forest$node.predictions, rf$num.trees) expect_is(rf$forest$node.predictions[[1]], "numeric") + + expect_is(rf$forest$split.stats, "list") + expect_length(rf$forest$split.stats, rf$num.trees) + expect_is(rf$forest$split.stats[[1]], "numeric") }) test_that("if node.stats TRUE, nodestats saved, probability", { @@ -54,6 +62,10 @@ test_that("if node.stats TRUE, nodestats saved, probability", { expect_is(rf$forest$terminal.class.counts, "list") expect_length(rf$forest$terminal.class.counts, rf$num.trees) expect_length(rf$forest$terminal.class.counts[[1]][[1]], nlevels(iris$Species)) + + expect_is(rf$forest$split.stats, "list") + expect_length(rf$forest$split.stats, rf$num.trees) + expect_is(rf$forest$split.stats[[1]], "numeric") }) test_that("if node.stats TRUE, nodestats saved, regression", { @@ -66,6 +78,10 @@ test_that("if node.stats TRUE, nodestats saved, regression", { expect_is(rf$forest$node.predictions, "list") expect_length(rf$forest$node.predictions, rf$num.trees) expect_is(rf$forest$node.predictions[[1]], "numeric") + + expect_is(rf$forest$split.stats, "list") + expect_length(rf$forest$split.stats, rf$num.trees) + expect_is(rf$forest$split.stats[[1]], "numeric") }) test_that("if node.stats TRUE, nodestats saved, survival", { @@ -82,26 +98,11 @@ test_that("if node.stats TRUE, nodestats saved, survival", { expect_is(rf$forest$chf[[1]], "list") expect_is(rf$forest$chf[[1]][[1]], "numeric") expect_length(rf$forest$chf[[1]][[1]], length(rf$unique.death.times)) + + expect_is(rf$forest$split.stats, "list") + expect_length(rf$forest$split.stats, rf$num.trees) + expect_is(rf$forest$split.stats[[1]], "numeric") }) -rf <- ranger(Species ~ ., iris, num.trees = 10, probability = TRUE, node.stats = TRUE) -rf$forest$num.samples.nodes -rf$forest$node.predictions -rf$forest$terminal.class.counts - - -rf <- ranger(Sepal.Length ~ ., iris, num.trees = 10, node.stats = TRUE) -rf$forest$num.samples.nodes 
-rf$forest$node.predictions - -# Survival - -rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 10, node.stats = TRUE) -rf$forest$num.samples.nodes -rf$forest$node.predictions -rf$forest$chf - - - From fe888b8ebdc14a6ea1542438e84e633d88c07388 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 22:39:01 +0100 Subject: [PATCH 088/111] also save name of status variable name for survival --- R/ranger.R | 11 ++++++++--- man/ranger.Rd | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index 12b4db5ff..9766a356f 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -148,6 +148,7 @@ ##' \item{\code{num.samples}}{Number of samples.} ##' \item{\code{inbag.counts}}{Number of times the observations are in-bag in the trees.} ##' \item{\code{dependent.variable.name}}{Name of the dependent variable. This is NULL when x/y interface is used.} +##' \item{\code{status.variable.name}}{Name of the status variable (survival only). This is NULL when x/y interface is used.} ##' @examples ##' ## Classification forest with default settings ##' ranger(Species ~ ., data = iris) @@ -278,7 +279,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: Invalid formula.") } data.selected <- parse.formula(formula, data, env = parent.frame()) - dependent.variable.name = names(data.selected)[1] + dependent.variable.name <- all.vars(formula)[1] + if (is.Surv(data.selected[, 1])) { + status.variable.name <- all.vars(formula)[2] + } y <- data.selected[, 1] x <- data.selected[, -1, drop = FALSE] } @@ -1004,9 +1008,10 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } } - ## slot: dependent.variable.name + ## Dependent (and status) variable name ## will be NULL only when x/y interface is used - result$dependent.variable.name = dependent.variable.name + result$dependent.variable.name <- dependent.variable.name + result$status.variable.name <- status.variable.name class(result) <- 
"ranger" diff --git a/man/ranger.Rd b/man/ranger.Rd index ff202a074..361200ae3 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -152,6 +152,8 @@ Object of class \code{ranger} with elements \item{\code{importance.mode}}{Importance mode used.} \item{\code{num.samples}}{Number of samples.} \item{\code{inbag.counts}}{Number of times the observations are in-bag in the trees.} + \item{\code{dependent.variable.name}}{Name of the dependent variable. This is NULL when x/y interface is used.} + \item{\code{status.variable.name}}{Name of the status variable (survival only). This is NULL when x/y interface is used.} } \description{ Ranger is a fast implementation of random forests (Breiman 2001) or recursive partitioning, particularly suited for high dimensional data. From 1bacb3f2594529a322c8e1d658ebc39478f5cec7 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 22:50:41 +0100 Subject: [PATCH 089/111] is.Surv is in survival package --- R/ranger.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ranger.R b/R/ranger.R index 9766a356f..ee24e8bcb 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -280,7 +280,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } data.selected <- parse.formula(formula, data, env = parent.frame()) dependent.variable.name <- all.vars(formula)[1] - if (is.Surv(data.selected[, 1])) { + if (survival::is.Surv(data.selected[, 1])) { status.variable.name <- all.vars(formula)[2] } y <- data.selected[, 1] From 0bf78762907e187b9c94eedd90de3200fc18b4e3 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Tue, 7 Nov 2023 22:50:50 +0100 Subject: [PATCH 090/111] update tests --- tests/testthat/test_classification.R | 4 ++-- tests/testthat/test_print.R | 2 +- tests/testthat/test_regression.R | 4 ++-- tests/testthat/test_survival.R | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test_classification.R b/tests/testthat/test_classification.R index 9f219cf05..0d015c9de 
100644 --- a/tests/testthat/test_classification.R +++ b/tests/testthat/test_classification.R @@ -10,9 +10,9 @@ rg.class <- ranger(Species ~ ., data = iris) rg.mat <- ranger(dependent.variable.name = "Species", data = dat, classification = TRUE) ## Basic tests (for all random forests equal) -test_that("classification result is of class ranger with 14 elements", { +test_that("classification result is of class ranger with 15 elements", { expect_is(rg.class, "ranger") - expect_equal(length(rg.class), 14) + expect_equal(length(rg.class), 15) }) test_that("classification prediction returns factor", { diff --git a/tests/testthat/test_print.R b/tests/testthat/test_print.R index fd9f3512b..3ca91b4a8 100644 --- a/tests/testthat/test_print.R +++ b/tests/testthat/test_print.R @@ -16,7 +16,7 @@ expect_that(print(rf$forest), prints_text("Ranger forest object")) expect_that(print(predict(rf, iris)), prints_text("Ranger prediction")) ## Test str ranger function -expect_that(str(rf), prints_text("List of 14")) +expect_that(str(rf), prints_text("List of 15")) ## Test str forest function expect_that(str(rf$forest), prints_text("List of 9")) diff --git a/tests/testthat/test_regression.R b/tests/testthat/test_regression.R index 8bb3d11a6..dd3bdd4ea 100644 --- a/tests/testthat/test_regression.R +++ b/tests/testthat/test_regression.R @@ -7,9 +7,9 @@ context("ranger_reg") rg.reg <- ranger(Sepal.Length ~ ., data = iris) ## Basic tests (for all random forests equal) -test_that("regression result is of class ranger with 14 elements", { +test_that("regression result is of class ranger with 15 elements", { expect_is(rg.reg, "ranger") - expect_equal(length(rg.reg), 14) + expect_equal(length(rg.reg), 15) }) test_that("regression prediction returns numeric vector", { diff --git a/tests/testthat/test_survival.R b/tests/testthat/test_survival.R index 6676d6e0b..6226eb6fb 100644 --- a/tests/testthat/test_survival.R +++ b/tests/testthat/test_survival.R @@ -8,9 +8,9 @@ context("ranger_surv") rg.surv 
<- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 10) ## Basic tests (for all random forests equal) -test_that("survival result is of class ranger with 15 elements", { +test_that("survival result is of class ranger with 17 elements", { expect_is(rg.surv, "ranger") - expect_equal(length(rg.surv), 15) + expect_equal(length(rg.surv), 17) }) test_that("results have right number of trees", { From fb947d8170b0cda71804ebbf3c8e07912098f1e4 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 8 Nov 2023 06:46:43 +0100 Subject: [PATCH 091/111] add notes on difference with extratrees #408, single decision trees #689 and default values #640 --- R/ranger.R | 13 +++++++++++++ man/ranger.Rd | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/R/ranger.R b/R/ranger.R index ee24e8bcb..750d1bda1 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -65,6 +65,18 @@ ##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. ##' We recommend not to use the 'impurity_corrected' importance when making predictions since the feature permutation step might reduce predictive performance (a warning is raised when predicting on new data). ##' +##' Note that ranger has different default values than other packages. +##' For example, our default for \code{mtry} is the square root of the number of variables for all tree types, whereas other packages use different values for regression. +##' Also, changing one hyperparameter does not change other hyperparameters (where possible). +##' For example, \code{splitrule="extratrees"} uses randomized splitting but does not disable bagging as in Geurts et al. (2006). +##' To disable bagging, use \code{replace = FALSE, sample.fraction = 1}. +##' This can also be used to grow a single decision tree without bagging and feature subsetting: \code{ranger(..., num.trees = 1, mtry = p, replace = FALSE, sample.fraction = 1)}, where p is the number of independent variables. 
+##' +##' While random forests are known for their robustness, default hyperparameters do not always work well. +##' For example, for high dimensional data, increasing the \code{mtry} value and the number of trees \code{num.trees} is recommended. +##' For more details and recommendations, see Probst et al. (2019). +##' To find the best hyperparameters, consider hyperparameter tuning with the \code{tuneRanger} or \code{mlr3} packages. +##' ##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. ##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. ##' If regularization is used, multithreading is deactivated because all trees need access to the list of variables that are already included in the model. @@ -205,6 +217,7 @@ ##' \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \doi{10.1198/106186008X344522}. ##' \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}. ##' \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}. +##' \item Probst, P., Wright, M. N. & Boulesteix, A-L. (2019). Hyperparameters and tuning strategies for random forest. WIREs Data Mining Knowl Discov 9:e1301.\doi{10.1002/widm.1301}. 
##' } ##' @seealso \code{\link{predict.ranger}} ##' @useDynLib ranger, .registration = TRUE diff --git a/man/ranger.Rd b/man/ranger.Rd index 361200ae3..6d7a3716e 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -196,6 +196,18 @@ It is a modified version of the method by Sandri & Zuccolotto (2008), which is f This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. We recommend not to use the 'impurity_corrected' importance when making predictions since the feature permutation step might reduce predictive performance (a warning is raised when predicting on new data). +Note that ranger has different default values than other packages. +For example, our default for \code{mtry} is the square root of the number of variables for all tree types, whereas other packages use different values for regression. +Also, changing one hyperparameter does not change other hyperparameters (where possible). +For example, \code{splitrule="extratrees"} uses randomized splitting but does not disable bagging as in Geurts et al. (2006). +To disable bagging, use \code{replace = FALSE, sample.fraction = 1}. +This can also be used to grow a single decision tree without bagging and feature subsetting: \code{ranger(..., num.trees = 1, mtry = p, replace = FALSE, sample.fraction = 1)}, where p is the number of independent variables. + +While random forests are known for their robustness, default hyperparameters do not always work well. +For example, for high dimensional data, increasing the \code{mtry} value and the number of trees \code{num.trees} is recommended. +For more details and recommendations, see Probst et al. (2019). +To find the best hyperparameters, consider hyperparameter tuning with the \code{tuneRanger} or \code{mlr3} packages. + Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. 
If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. If regularization is used, multithreading is deactivated because all trees need access to the list of variables that are already included in the model. @@ -272,6 +284,7 @@ ranger(trait ~ ., data = dat.gwaa) \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \doi{10.1198/106186008X344522}. \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}. \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}. + \item Probst, P., Wright, M. N. & Boulesteix, A-L. (2019). Hyperparameters and tuning strategies for random forest. WIREs Data Mining Knowl Discov 9:e1301.\doi{10.1002/widm.1301}. } } \seealso{ From ccce3ab27729b0c1be2be06bcc4ade8ca156c22b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 8 Nov 2023 07:11:09 +0100 Subject: [PATCH 092/111] improve C++14 error #669 --- src/AAA_check_cpp11.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AAA_check_cpp11.cpp b/src/AAA_check_cpp11.cpp index 5df6e69b0..0b927ccdf 100644 --- a/src/AAA_check_cpp11.cpp +++ b/src/AAA_check_cpp11.cpp @@ -1,6 +1,6 @@ #ifndef WIN_R_BUILD #if __cplusplus < 201402L -#error Error: ranger requires a C++14 compiler, e.g., gcc >= 5 or Clang >= 3.4. You probably have to update your C++ compiler. +#error Error: ranger requires C++14. Possible fixes: 1) Update R, 2) Set "CXX = g++ -std=gnu++11" or similar in local Makevars, 3) update C++ compiler. See https://github.com/imbs-hl/ranger/wiki/FAQ. 
#endif #endif From e6e3b900539a4f7cfc1943cfe48a4c7ee7107b31 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 8 Nov 2023 07:33:30 +0100 Subject: [PATCH 093/111] add note on C-index calculation --- R/ranger.R | 12 ++++++++---- man/ranger.Rd | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index 750d1bda1..8717c209b 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -76,6 +76,10 @@ ##' For example, for high dimensional data, increasing the \code{mtry} value and the number of trees \code{num.trees} is recommended. ##' For more details and recommendations, see Probst et al. (2019). ##' To find the best hyperparameters, consider hyperparameter tuning with the \code{tuneRanger} or \code{mlr3} packages. +##' +##' Out-of-bag prediction error is calculated as misclassification rate (proportion of misclassified observations) for classification, as Brier score for probability estimation, as mean squared error (MSE) for regression and as one minus Harrell's C-index for survival. +##' Harrell's C-index is calculated based on the sum of the cumulative hazard function (CHF) over all timepoints, i.e., \code{rowSums(chf)}, where \code{chf} is the out-of-bag CHF; for details, see Ishwaran et al. (2008). +##' Calculation of the out-of-bag prediction error can be turned off with \code{oob.error = FALSE}. ##' ##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. ##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. @@ -141,12 +145,12 @@ ##' @param ... Further arguments passed to or from other methods (currently ignored). ##' @return Object of class \code{ranger} with elements ##' \item{\code{forest}}{Saved forest (If write.forest set to TRUE). 
Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.} -##' \item{\code{predictions}}{Predicted classes/values, based on out of bag samples (classification and regression only).} +##' \item{\code{predictions}}{Predicted classes/values, based on out-of-bag samples (classification and regression only).} ##' \item{\code{variable.importance}}{Variable importance for each independent variable.} ##' \item{\code{variable.importance.local}}{Variable importance for each independent variable and each sample, if \code{local.importance} is set to TRUE and \code{importance} is set to 'permutation'.} -##' \item{\code{prediction.error}}{Overall out of bag prediction error. For classification this is the fraction of missclassified samples, for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.} -##' \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out of bag data.} -##' \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out of bag samples (classification only).} +##' \item{\code{prediction.error}}{Overall out-of-bag prediction error. For classification this is misclassification rate (proportion of misclassified observations), for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.} +##' \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). 
Computed on out-of-bag data.} +##' \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).} ##' \item{\code{unique.death.times}}{Unique death times (survival only).} ##' \item{\code{chf}}{Estimated cumulative hazard function for each sample (survival only).} ##' \item{\code{survival}}{Estimated survival function for each sample (survival only).} diff --git a/man/ranger.Rd b/man/ranger.Rd index 6d7a3716e..f21b7e824 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -134,12 +134,12 @@ ranger( \value{ Object of class \code{ranger} with elements \item{\code{forest}}{Saved forest (If write.forest set to TRUE). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.} - \item{\code{predictions}}{Predicted classes/values, based on out of bag samples (classification and regression only).} + \item{\code{predictions}}{Predicted classes/values, based on out-of-bag samples (classification and regression only).} \item{\code{variable.importance}}{Variable importance for each independent variable.} \item{\code{variable.importance.local}}{Variable importance for each independent variable and each sample, if \code{local.importance} is set to TRUE and \code{importance} is set to 'permutation'.} - \item{\code{prediction.error}}{Overall out of bag prediction error. For classification this is the fraction of missclassified samples, for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.} - \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out of bag data.} - \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out of bag samples (classification only).} + \item{\code{prediction.error}}{Overall out-of-bag prediction error. 
For classification this is misclassification rate (proportion of misclassified observations), for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.} + \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out-of-bag data.} + \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).} \item{\code{unique.death.times}}{Unique death times (survival only).} \item{\code{chf}}{Estimated cumulative hazard function for each sample (survival only).} \item{\code{survival}}{Estimated survival function for each sample (survival only).} @@ -208,6 +208,10 @@ For example, for high dimensional data, increasing the \code{mtry} value and the For more details and recommendations, see Probst et al. (2019). To find the best hyperparameters, consider hyperparameter tuning with the \code{tuneRanger} or \code{mlr3} packages. +Out-of-bag prediction error is calculated as misclassification rate (proportion of misclassified observations) for classification, as Brier score for probability estimation, as mean squared error (MSE) for regression and as one minus Harrell's C-index for survival. +Harrell's C-index is calculated based on the sum of the cumulative hazard function (CHF) over all timepoints, i.e., \code{rowSums(chf)}, where \code{chf} is the out-of-bag CHF; for details, see Ishwaran et al. (2008). +Calculation of the out-of-bag prediction error can be turned off with \code{oob.error = FALSE}. + Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details. If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node. 
If regularization is used, multithreading is deactivated because all trees need access to the list of variables that are already included in the model. From 53e6455d49d6e57a4481af952e447613aff4060d Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 8 Nov 2023 08:12:14 +0100 Subject: [PATCH 094/111] use more trees for quantile regression test --- tests/testthat/test_quantreg.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_quantreg.R b/tests/testthat/test_quantreg.R index b2f6ffab6..343c234c5 100644 --- a/tests/testthat/test_quantreg.R +++ b/tests/testthat/test_quantreg.R @@ -2,7 +2,7 @@ library(ranger) context("ranger_quantreg") rf.quant <- ranger(mpg ~ ., mtcars[1:26, ], quantreg = TRUE, - keep.inbag = TRUE, num.trees = 50) + keep.inbag = TRUE, num.trees = 500) pred.quant <- predict(rf.quant, mtcars[27:32, ], type = "quantiles") test_that("Quantile prediction is of correct size", { From 18a193cffda40b8452b5796b7144899c0ad01c32 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 8 Nov 2023 09:26:36 +0100 Subject: [PATCH 095/111] C++14 not C++11 --- src/{AAA_check_cpp11.cpp => AAA_check_cpp14.cpp} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/{AAA_check_cpp11.cpp => AAA_check_cpp14.cpp} (75%) diff --git a/src/AAA_check_cpp11.cpp b/src/AAA_check_cpp14.cpp similarity index 75% rename from src/AAA_check_cpp11.cpp rename to src/AAA_check_cpp14.cpp index 0b927ccdf..7d785bc65 100644 --- a/src/AAA_check_cpp11.cpp +++ b/src/AAA_check_cpp14.cpp @@ -1,6 +1,6 @@ #ifndef WIN_R_BUILD #if __cplusplus < 201402L -#error Error: ranger requires C++14. Possible fixes: 1) Update R, 2) Set "CXX = g++ -std=gnu++11" or similar in local Makevars, 3) update C++ compiler. See https://github.com/imbs-hl/ranger/wiki/FAQ. +#error Error: ranger requires C++14. Possible fixes: 1) Update R, 2) Set "CXX = g++ -std=gnu++14" or similar in local Makevars, 3) update C++ compiler. See https://github.com/imbs-hl/ranger/wiki/FAQ. 
#endif #endif From aaf08f938da1dd7f83226ec30b453f12b9dc784f Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 9 Nov 2023 15:53:14 +0100 Subject: [PATCH 096/111] add a warning when using formula interface with high-dimensional data --- R/ranger.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/ranger.R b/R/ranger.R index 211874b0e..660c82d51 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -296,6 +296,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (!inherits(formula, "formula")) { stop("Error: Invalid formula.") } + if (ncol(data) > 10000) { + warning("Avoid the formula interface for high-dimensional data. If ranger is slow or you get a 'protection stack overflow' error, consider the x/y or dependent.variable.name interface (see examples).") + } data.selected <- parse.formula(formula, data, env = parent.frame()) dependent.variable.name <- all.vars(formula)[1] if (survival::is.Surv(data.selected[, 1])) { From 4795a99d43667b47991883fd6ff44e15b2891237 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 9 Nov 2023 16:04:12 +0100 Subject: [PATCH 097/111] new CRAN version --- DESCRIPTION | 4 ++-- NEWS | 7 +++++++ NEWS.md | 3 +++ cpp_version/src/version.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 48dcd8c5b..c842a425d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.15.4 -Date: 2023-11-07 +Version: 0.16.0 +Date: 2023-11-09 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. 
Wright Description: A fast implementation of Random Forests, particularly suited for high diff --git a/NEWS b/NEWS index 9f5f3f6c1..ed9481ca0 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,11 @@ +##### Version 0.16.0 +* Add node.stats option to save node statistics of all nodes +* Add time.interest option to restrict unique survival times (faster and saves memory) +* Fix min bucket option in C++ version +* Fix memory error for always.split.variables in certain settings +* Fix quantile regression for factor variables in "order" mode + ##### Version 0.15.0 * Switch to C++14 standard * Add min.bucket parameter to restrict terminal node size diff --git a/NEWS.md b/NEWS.md index 5d62851cb..ad66d3272 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.16.0 +* New CRAN version + # ranger 0.15.4 * Add node.stats option to save node statistics of all nodes * Add time.interest option to restrict unique survival times (faster and saves memory) diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 2f82b8ee6..3601cfabc 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.15.4" +#define RANGER_VERSION "0.16.0" #endif From 28be15b16ec00f9af169d170bbbceab35f36928b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Fri, 10 Nov 2023 19:33:53 +0100 Subject: [PATCH 098/111] don't require survival package for check --- R/ranger.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ranger.R b/R/ranger.R index 660c82d51..c33da6277 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -301,7 +301,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } data.selected <- parse.formula(formula, data, env = parent.frame()) dependent.variable.name <- all.vars(formula)[1] - if (survival::is.Surv(data.selected[, 1])) { + if (inherits(data.selected[, 1], "Surv")) { status.variable.name <- all.vars(formula)[2] } y <- data.selected[, 1] From 
150adac0fb2a0bf9e32525e48b902589aff90ac0 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 22 Nov 2023 16:33:50 +0100 Subject: [PATCH 099/111] add check for inbag element size --- R/ranger.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/ranger.R b/R/ranger.R index c33da6277..8598bfb88 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -659,6 +659,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (length(inbag) != num.trees) { stop("Error: Size of inbag list not equal to number of trees.") } + if (any(sapply(inbags, length) != nrow(x))) { + stop("Error: Size of at least one element in inbag not equal to number of samples.") + } } else { stop("Error: Invalid inbag, expects list of vectors of size num.trees.") } From b9792f225b175bec73fc439fe2ef91d53d12bee4 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 22 Nov 2023 16:49:24 +0100 Subject: [PATCH 100/111] inbag not inbags --- R/ranger.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ranger.R b/R/ranger.R index 8598bfb88..6d56d4d45 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -659,7 +659,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (length(inbag) != num.trees) { stop("Error: Size of inbag list not equal to number of trees.") } - if (any(sapply(inbags, length) != nrow(x))) { + if (any(sapply(inbag, length) != nrow(x))) { stop("Error: Size of at least one element in inbag not equal to number of samples.") } } else { From 93dd105707d9a7e681254c3913d22b3ff547b8cc Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 6 Dec 2023 20:47:58 +0100 Subject: [PATCH 101/111] default to 2 threads but give a startup message --- R/onAttach.R | 18 ++++++++++++++++++ R/predict.R | 2 +- R/ranger.R | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 R/onAttach.R diff --git a/R/onAttach.R b/R/onAttach.R new file mode 100644 index 000000000..3f098998e --- /dev/null +++ b/R/onAttach.R @@ -0,0 +1,18 @@ + 
+.onAttach = function(libname, pkgname) { + if (!interactive()) { + return() + } + + threads_option <- getOption("ranger.num.threads") + threads_env <- Sys.getenv("R_RANGER_NUM_THREADS") + if (!is.null(threads_option)) { + thread_string <- paste(threads_option, "threads (set by options(ranger.num.threads = N).") + } else if (threads_env != "") { + thread_string <- paste(threads_env, "threads (set by environment variable 'R_RANGER_NUM_THREADS').") + } else { + thread_string <- "2 threads (default). Change with num.threads in ranger() and predict(), options(ranger.num.threads = N) or environment variable 'R_RANGER_NUM_THREADS'." + } + + packageStartupMessage(paste("ranger", packageVersion("ranger"), "using", thread_string)) +} diff --git a/R/predict.R b/R/predict.R index 82599ab01..0530654ba 100644 --- a/R/predict.R +++ b/R/predict.R @@ -193,7 +193,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ## Num threads ## Default 0 -> detect from system in C++. if (is.null(num.threads)) { - num.threads = 0 + num.threads <- as.integer(Sys.getenv("R_RANGER_NUM_THREADS", getOption("ranger.num.threads", 2L))) } else if (!is.numeric(num.threads) || num.threads < 0) { stop("Error: Invalid value for num.threads") } diff --git a/R/ranger.R b/R/ranger.R index 6d56d4d45..dc4dd823f 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -514,7 +514,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Num threads ## Default 0 -> detect from system in C++. 
if (is.null(num.threads)) { - num.threads = 0 + num.threads <- as.integer(Sys.getenv("R_RANGER_NUM_THREADS", getOption("ranger.num.threads", 2L))) } else if (!is.numeric(num.threads) || num.threads < 0) { stop("Error: Invalid value for num.threads") } From a0e8fcd4464fff30267903e0cc3ca7ae71c2542e Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 6 Dec 2023 20:58:12 +0100 Subject: [PATCH 102/111] improve startup message --- R/onAttach.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/onAttach.R b/R/onAttach.R index 3f098998e..e956271e0 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -7,11 +7,11 @@ threads_option <- getOption("ranger.num.threads") threads_env <- Sys.getenv("R_RANGER_NUM_THREADS") if (!is.null(threads_option)) { - thread_string <- paste(threads_option, "threads (set by options(ranger.num.threads = N).") + thread_string <- paste(threads_option, "threads as set by options(ranger.num.threads = N). Can be overwritten with num.threads.") } else if (threads_env != "") { - thread_string <- paste(threads_env, "threads (set by environment variable 'R_RANGER_NUM_THREADS').") + thread_string <- paste(threads_env, "threads as set by environment variable R_RANGER_NUM_THREADS. Can be overwritten with num.threads.") } else { - thread_string <- "2 threads (default). Change with num.threads in ranger() and predict(), options(ranger.num.threads = N) or environment variable 'R_RANGER_NUM_THREADS'." + thread_string <- "2 threads (default). Change with num.threads in ranger() and predict(), options(ranger.num.threads = N) or environment variable R_RANGER_NUM_THREADS." 
} packageStartupMessage(paste("ranger", packageVersion("ranger"), "using", thread_string)) From 33ea5da1d3bbe2d20a187c67ffad7d08b270f90d Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 6 Dec 2023 21:14:11 +0100 Subject: [PATCH 103/111] also use Ncpus option --- R/onAttach.R | 12 ++++++++---- R/predict.R | 2 +- R/ranger.R | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/R/onAttach.R b/R/onAttach.R index e956271e0..972d2edfd 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -4,12 +4,16 @@ return() } - threads_option <- getOption("ranger.num.threads") threads_env <- Sys.getenv("R_RANGER_NUM_THREADS") - if (!is.null(threads_option)) { - thread_string <- paste(threads_option, "threads as set by options(ranger.num.threads = N). Can be overwritten with num.threads.") - } else if (threads_env != "") { + threads_option1 <- getOption("ranger.num.threads") + threads_option2 <- getOption("Ncpus") + + if (threads_env != "") { thread_string <- paste(threads_env, "threads as set by environment variable R_RANGER_NUM_THREADS. Can be overwritten with num.threads.") + } else if (!is.null(threads_option1)) { + thread_string <- paste(threads_option1, "threads as set by options(ranger.num.threads = N). Can be overwritten with num.threads.") + } else if (!is.null(threads_option2)) { + thread_string <- paste(threads_option2, "threads as set by options(Ncpus = N). Can be overwritten with num.threads.") } else { thread_string <- "2 threads (default). Change with num.threads in ranger() and predict(), options(ranger.num.threads = N) or environment variable R_RANGER_NUM_THREADS." } diff --git a/R/predict.R b/R/predict.R index 0530654ba..0b45c3ec9 100644 --- a/R/predict.R +++ b/R/predict.R @@ -193,7 +193,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ## Num threads ## Default 0 -> detect from system in C++. 
if (is.null(num.threads)) { - num.threads <- as.integer(Sys.getenv("R_RANGER_NUM_THREADS", getOption("ranger.num.threads", 2L))) + num.threads <- as.integer(Sys.getenv("R_RANGER_NUM_THREADS", getOption("ranger.num.threads", getOption("Ncpus", 2L)))) } else if (!is.numeric(num.threads) || num.threads < 0) { stop("Error: Invalid value for num.threads") } diff --git a/R/ranger.R b/R/ranger.R index dc4dd823f..eeaf80bfc 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -514,7 +514,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Num threads ## Default 0 -> detect from system in C++. if (is.null(num.threads)) { - num.threads <- as.integer(Sys.getenv("R_RANGER_NUM_THREADS", getOption("ranger.num.threads", 2L))) + num.threads <- as.integer(Sys.getenv("R_RANGER_NUM_THREADS", getOption("ranger.num.threads", getOption("Ncpus", 2L)))) } else if (!is.numeric(num.threads) || num.threads < 0) { stop("Error: Invalid value for num.threads") } From 8ba5fe8af303263aa61b396e524c80b759113cec Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 6 Dec 2023 21:19:33 +0100 Subject: [PATCH 104/111] add Ncpus to startup message --- R/onAttach.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/onAttach.R b/R/onAttach.R index 972d2edfd..61b69ddaf 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -15,7 +15,7 @@ } else if (!is.null(threads_option2)) { thread_string <- paste(threads_option2, "threads as set by options(Ncpus = N). Can be overwritten with num.threads.") } else { - thread_string <- "2 threads (default). Change with num.threads in ranger() and predict(), options(ranger.num.threads = N) or environment variable R_RANGER_NUM_THREADS." + thread_string <- "2 threads (default). Change with num.threads in ranger() and predict(), options(Ncpus = N), options(ranger.num.threads = N) or environment variable R_RANGER_NUM_THREADS." 
} packageStartupMessage(paste("ranger", packageVersion("ranger"), "using", thread_string)) From 553aa9a35e34ff77848ea39c42b492fc3e21c69b Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 6 Dec 2023 21:29:17 +0100 Subject: [PATCH 105/111] add documentation for threads --- R/predict.R | 10 ++++++++-- R/ranger.R | 8 ++++---- man/predict.ranger.Rd | 5 ++++- man/predict.ranger.forest.Rd | 5 ++++- man/ranger.Rd | 8 ++++---- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/R/predict.R b/R/predict.R index 0b45c3ec9..7ed7b0e5e 100644 --- a/R/predict.R +++ b/R/predict.R @@ -36,6 +36,9 @@ ##' ##' For classification and \code{predict.all = TRUE}, a factor levels are returned as numerics. ##' To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object. +##' +##' By default, ranger uses 2 threads. The default can be changed with: (1) \code{num.threads} in ranger/predict call, (2) environment variable +##' R_RANGER_NUM_THREADS, (3) \code{options(ranger.num.threads = N)}, (4) \code{options(Ncpus = N)}, with precedence in that order. ##' ##' @title Ranger prediction ##' @param object Ranger \code{ranger.forest} object. @@ -45,7 +48,7 @@ ##' @param type Type of prediction. One of 'response', 'se', 'terminalNodes', 'quantiles' with default 'response'. See below for details. ##' @param se.method Method to compute standard errors. One of 'jack', 'infjack' with default 'infjack'. Only applicable if type = 'se'. See below for details. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode. -##' @param num.threads Number of threads. Default is number of CPUs available. +##' @param num.threads Number of threads. Default is 2 if not set by options/environment variables (see below). ##' @param verbose Verbose output on or off. 
##' @param inbag.counts Number of times the observations are in-bag in the trees. ##' @param ... further arguments passed to or from other methods. @@ -433,6 +436,9 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ##' ##' For classification and \code{predict.all = TRUE}, a factor levels are returned as numerics. ##' To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object. +##' +##' By default, ranger uses 2 threads. The default can be changed with: (1) \code{num.threads} in ranger/predict call, (2) environment variable +##' R_RANGER_NUM_THREADS, (3) \code{options(ranger.num.threads = N)}, (4) \code{options(Ncpus = N)}, with precedence in that order. ##' ##' @title Ranger prediction ##' @param object Ranger \code{ranger} object. @@ -444,7 +450,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ##' @param quantiles Vector of quantiles for quantile prediction. Set \code{type = 'quantiles'} to use. ##' @param what User specified function for quantile prediction used instead of \code{quantile}. Must return numeric vector, see examples. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode. -##' @param num.threads Number of threads. Default is number of CPUs available. +##' @param num.threads Number of threads. Default is 2 if not set by options/environment variables (see below). ##' @param verbose Verbose output on or off. ##' @param ... further arguments passed to or from other methods. ##' @return Object of class \code{ranger.prediction} with elements diff --git a/R/ranger.R b/R/ranger.R index eeaf80bfc..028616c2e 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -96,10 +96,10 @@ ##' To use only the SNPs without sex or other covariates from the phenotype file, use \code{0} on the right hand side of the formula. 
##' Note that missing values are treated as an extra category while splitting. ##' -##' See \url{https://github.com/imbs-hl/ranger} for the development version. +##' By default, ranger uses 2 threads. The default can be changed with: (1) \code{num.threads} in ranger/predict call, (2) environment variable +##' R_RANGER_NUM_THREADS, (3) \code{options(ranger.num.threads = N)}, (4) \code{options(Ncpus = N)}, with precedence in that order. ##' -##' With recent R versions, multithreading on Windows platforms should just work. -##' If you compile yourself, the new RTools toolchain is required. +##' See \url{https://github.com/imbs-hl/ranger} for the development version. ##' ##' @title Ranger ##' @param formula Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables. @@ -133,7 +133,7 @@ ##' @param quantreg Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction. ##' @param time.interest Time points of interest (survival only). Can be \code{NULL} (default, use all observed time points), a vector of time points or a single number to use as many time points (grid over observed time points). ##' @param oob.error Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests. -##' @param num.threads Number of threads. Default is number of CPUs available. +##' @param num.threads Number of threads. Default is 2 if not set by options/environment variables (see below). ##' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. ##' @param verbose Show computation status and estimated runtime. ##' @param node.stats Save node statistics. 
Set to \code{TRUE} to save prediction, number of observations and split statistics for each node. diff --git a/man/predict.ranger.Rd b/man/predict.ranger.Rd index 362befcae..1fce859f4 100644 --- a/man/predict.ranger.Rd +++ b/man/predict.ranger.Rd @@ -38,7 +38,7 @@ \item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode.} -\item{num.threads}{Number of threads. Default is number of CPUs available.} +\item{num.threads}{Number of threads. Default is 2 if not set by options/environment variables (see below).} \item{verbose}{Verbose output on or off.} @@ -70,6 +70,9 @@ If \code{type = 'se'} is selected, the method to estimate the variances can be c For classification and \code{predict.all = TRUE}, a factor levels are returned as numerics. To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object. + +By default, ranger uses 2 threads. The default can be changed with: (1) \code{num.threads} in ranger/predict call, (2) environment variable +R_RANGER_NUM_THREADS, (3) \code{options(ranger.num.threads = N)}, (4) \code{options(Ncpus = N)}, with precedence in that order. } \examples{ ## Classification forest diff --git a/man/predict.ranger.forest.Rd b/man/predict.ranger.forest.Rd index ba018b0e3..0331fc4b0 100644 --- a/man/predict.ranger.forest.Rd +++ b/man/predict.ranger.forest.Rd @@ -33,7 +33,7 @@ \item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode.} -\item{num.threads}{Number of threads. Default is number of CPUs available.} +\item{num.threads}{Number of threads. 
Default is 2 if not set by options/environment variables (see below).} \item{verbose}{Verbose output on or off.} @@ -66,6 +66,9 @@ If \code{type = 'se'} is selected, the method to estimate the variances can be c For classification and \code{predict.all = TRUE}, a factor levels are returned as numerics. To retrieve the corresponding factor levels, use \code{rf$forest$levels}, if \code{rf} is the ranger object. + +By default, ranger uses 2 threads. The default can be changed with: (1) \code{num.threads} in ranger/predict call, (2) environment variable +R_RANGER_NUM_THREADS, (3) \code{options(ranger.num.threads = N)}, (4) \code{options(Ncpus = N)}, with precedence in that order. } \references{ \itemize{ diff --git a/man/ranger.Rd b/man/ranger.Rd index 61c6e5dfa..9e519c16d 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -112,7 +112,7 @@ ranger( \item{oob.error}{Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests.} -\item{num.threads}{Number of threads. Default is number of CPUs available.} +\item{num.threads}{Number of threads. Default is 2 if not set by options/environment variables (see below).} \item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems.} @@ -230,10 +230,10 @@ All SNPs in the \code{GenABEL} object will be used for splitting. To use only the SNPs without sex or other covariates from the phenotype file, use \code{0} on the right hand side of the formula. Note that missing values are treated as an extra category while splitting. -See \url{https://github.com/imbs-hl/ranger} for the development version. +By default, ranger uses 2 threads. 
The default can be changed with: (1) \code{num.threads} in ranger/predict call, (2) environment variable +R_RANGER_NUM_THREADS, (3) \code{options(ranger.num.threads = N)}, (4) \code{options(Ncpus = N)}, with precedence in that order. -With recent R versions, multithreading on Windows platforms should just work. -If you compile yourself, the new RTools toolchain is required. +See \url{https://github.com/imbs-hl/ranger} for the development version. } \examples{ ## Classification forest with default settings From 009028e91290e9eccb5ff286081c2d03e7e55306 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 6 Dec 2023 21:32:59 +0100 Subject: [PATCH 106/111] add note on num.threads = 0 --- R/predict.R | 4 ++-- R/ranger.R | 2 +- man/predict.ranger.Rd | 2 +- man/predict.ranger.forest.Rd | 2 +- man/ranger.Rd | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/predict.R b/R/predict.R index 7ed7b0e5e..d11c453ee 100644 --- a/R/predict.R +++ b/R/predict.R @@ -48,7 +48,7 @@ ##' @param type Type of prediction. One of 'response', 'se', 'terminalNodes', 'quantiles' with default 'response'. See below for details. ##' @param se.method Method to compute standard errors. One of 'jack', 'infjack' with default 'infjack'. Only applicable if type = 'se'. See below for details. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode. -##' @param num.threads Number of threads. Default is 2 if not set by options/environment variables (see below). +##' @param num.threads Number of threads. Use 0 for all available cores. Default is 2 if not set by options/environment variables (see below). ##' @param verbose Verbose output on or off. ##' @param inbag.counts Number of times the observations are in-bag in the trees. ##' @param ... further arguments passed to or from other methods. 
@@ -450,7 +450,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, ##' @param quantiles Vector of quantiles for quantile prediction. Set \code{type = 'quantiles'} to use. ##' @param what User specified function for quantile prediction used instead of \code{quantile}. Must return numeric vector, see examples. ##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode. -##' @param num.threads Number of threads. Default is 2 if not set by options/environment variables (see below). +##' @param num.threads Number of threads. Use 0 for all available cores. Default is 2 if not set by options/environment variables (see below). ##' @param verbose Verbose output on or off. ##' @param ... further arguments passed to or from other methods. ##' @return Object of class \code{ranger.prediction} with elements diff --git a/R/ranger.R b/R/ranger.R index 028616c2e..6fa95ddf3 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -133,7 +133,7 @@ ##' @param quantreg Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction. ##' @param time.interest Time points of interest (survival only). Can be \code{NULL} (default, use all observed time points), a vector of time points or a single number to use as many time points (grid over observed time points). ##' @param oob.error Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests. -##' @param num.threads Number of threads. Default is 2 if not set by options/environment variables (see below). +##' @param num.threads Number of threads. Use 0 for all available cores. Default is 2 if not set by options/environment variables (see below). ##' @param save.memory Use memory saving (but slower) splitting mode. 
No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. ##' @param verbose Show computation status and estimated runtime. ##' @param node.stats Save node statistics. Set to \code{TRUE} to save prediction, number of observations and split statistics for each node. diff --git a/man/predict.ranger.Rd b/man/predict.ranger.Rd index 1fce859f4..2f9c63ac3 100644 --- a/man/predict.ranger.Rd +++ b/man/predict.ranger.Rd @@ -38,7 +38,7 @@ \item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode.} -\item{num.threads}{Number of threads. Default is 2 if not set by options/environment variables (see below).} +\item{num.threads}{Number of threads. Use 0 for all available cores. Default is 2 if not set by options/environment variables (see below).} \item{verbose}{Verbose output on or off.} diff --git a/man/predict.ranger.forest.Rd b/man/predict.ranger.forest.Rd index 0331fc4b0..805effda4 100644 --- a/man/predict.ranger.forest.Rd +++ b/man/predict.ranger.forest.Rd @@ -33,7 +33,7 @@ \item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed. The seed is used in case of ties in classification mode.} -\item{num.threads}{Number of threads. Default is 2 if not set by options/environment variables (see below).} +\item{num.threads}{Number of threads. Use 0 for all available cores. Default is 2 if not set by options/environment variables (see below).} \item{verbose}{Verbose output on or off.} diff --git a/man/ranger.Rd b/man/ranger.Rd index 9e519c16d..4b1f61fe3 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -112,7 +112,7 @@ ranger( \item{oob.error}{Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests.} -\item{num.threads}{Number of threads. 
Default is 2 if not set by options/environment variables (see below).} +\item{num.threads}{Number of threads. Use 0 for all available cores. Default is 2 if not set by options/environment variables (see below).} \item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems.} From dc09de6d9c11e43a27e9f8c855e4d2eb7a5371a7 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Mon, 8 Jan 2024 06:49:22 +0100 Subject: [PATCH 107/111] save max.depth in ranger object --- R/ranger.R | 5 +++++ tests/testthat/test_classification.R | 4 ++-- tests/testthat/test_print.R | 2 +- tests/testthat/test_regression.R | 4 ++-- tests/testthat/test_survival.R | 4 ++-- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/R/ranger.R b/R/ranger.R index 6d56d4d45..fc78e4d02 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -1037,6 +1037,11 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, result$dependent.variable.name <- dependent.variable.name result$status.variable.name <- status.variable.name + ## Save max.depth + if (!is.null(max.depth)) { + result$max.depth <- max.depth + } + class(result) <- "ranger" ## Prepare quantile prediction diff --git a/tests/testthat/test_classification.R b/tests/testthat/test_classification.R index 0d015c9de..4690afe58 100644 --- a/tests/testthat/test_classification.R +++ b/tests/testthat/test_classification.R @@ -10,9 +10,9 @@ rg.class <- ranger(Species ~ ., data = iris) rg.mat <- ranger(dependent.variable.name = "Species", data = dat, classification = TRUE) ## Basic tests (for all random forests equal) -test_that("classification result is of class ranger with 15 elements", { +test_that("classification result is of class ranger with 16 elements", { expect_is(rg.class, "ranger") - expect_equal(length(rg.class), 15) + expect_equal(length(rg.class), 16) }) test_that("classification prediction returns 
factor", { diff --git a/tests/testthat/test_print.R b/tests/testthat/test_print.R index 3ca91b4a8..8563b1e31 100644 --- a/tests/testthat/test_print.R +++ b/tests/testthat/test_print.R @@ -16,7 +16,7 @@ expect_that(print(rf$forest), prints_text("Ranger forest object")) expect_that(print(predict(rf, iris)), prints_text("Ranger prediction")) ## Test str ranger function -expect_that(str(rf), prints_text("List of 15")) +expect_that(str(rf), prints_text("List of 16")) ## Test str forest function expect_that(str(rf$forest), prints_text("List of 9")) diff --git a/tests/testthat/test_regression.R b/tests/testthat/test_regression.R index dd3bdd4ea..8949f82d0 100644 --- a/tests/testthat/test_regression.R +++ b/tests/testthat/test_regression.R @@ -7,9 +7,9 @@ context("ranger_reg") rg.reg <- ranger(Sepal.Length ~ ., data = iris) ## Basic tests (for all random forests equal) -test_that("regression result is of class ranger with 15 elements", { +test_that("regression result is of class ranger with 16 elements", { expect_is(rg.reg, "ranger") - expect_equal(length(rg.reg), 15) + expect_equal(length(rg.reg), 16) }) test_that("regression prediction returns numeric vector", { diff --git a/tests/testthat/test_survival.R b/tests/testthat/test_survival.R index 6226eb6fb..358a40960 100644 --- a/tests/testthat/test_survival.R +++ b/tests/testthat/test_survival.R @@ -8,9 +8,9 @@ context("ranger_surv") rg.surv <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 10) ## Basic tests (for all random forests equal) -test_that("survival result is of class ranger with 17 elements", { +test_that("survival result is of class ranger with 18 elements", { expect_is(rg.surv, "ranger") - expect_equal(length(rg.surv), 17) + expect_equal(length(rg.surv), 18) }) test_that("results have right number of trees", { From b1fc1fdfff726b9ed871fe799a2e85395bfb3bfd Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 15 May 2024 17:04:23 +0200 Subject: [PATCH 108/111] allow vector 
min.node.size/min.bucket for class-wise limits --- DESCRIPTION | 6 +- NEWS.md | 3 + R/ranger.R | 53 +++++++++++++--- man/ranger.Rd | 4 +- src/Forest.cpp | 15 +++-- src/Forest.h | 12 ++-- src/ForestClassification.cpp | 8 +-- src/ForestProbability.cpp | 8 +-- src/ForestRegression.cpp | 8 +-- src/ForestSurvival.cpp | 8 +-- src/RcppExports.cpp | 6 +- src/Tree.cpp | 6 +- src/Tree.h | 6 +- src/TreeClassification.cpp | 113 ++++++++++++++++++++++++++++++++--- src/TreeProbability.cpp | 111 +++++++++++++++++++++++++++++++--- src/TreeRegression.cpp | 20 +++---- src/TreeSurvival.cpp | 20 +++---- src/rangerCpp.cpp | 2 +- tests/testthat/test_ranger.R | 98 ++++++++++++++++++++++++++++++ 19 files changed, 420 insertions(+), 87 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c842a425d..978e2b991 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.16.0 -Date: 2023-11-09 +Version: 0.16.1 +Date: 2024-05-15 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright Description: A fast implementation of Random Forests, particularly suited for high @@ -19,7 +19,7 @@ Suggests: survival, testthat Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 URL: http://imbs-hl.github.io/ranger/, https://github.com/imbs-hl/ranger BugReports: https://github.com/imbs-hl/ranger/issues diff --git a/NEWS.md b/NEWS.md index ad66d3272..a583b20cb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +# ranger 0.16.1 +* Allow vector min.node.size and min.bucket for class-specific limits + # ranger 0.16.0 * New CRAN version diff --git a/R/ranger.R b/R/ranger.R index 6d56d4d45..65e20df8c 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -109,8 +109,8 @@ ##' @param importance Variable importance mode, one of 'none', 'impurity', 'impurity_corrected', 'permutation'. 
The 'impurity' measure is the Gini index for classification, the variance of the responses for regression and the sum of test statistics (see \code{splitrule}) for survival. ##' @param write.forest Save \code{ranger.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended. ##' @param probability Grow a probability forest as in Malley et al. (2012). -##' @param min.node.size Minimal node size to split at. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability. -##' @param min.bucket Minimal terminal node size. No nodes smaller than this value can occur. Default 3 for survival and 1 for all other tree types. +##' @param min.node.size Minimal node size to split at. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability. For classification, this can be a vector of class-specific values. +##' @param min.bucket Minimal terminal node size. No nodes smaller than this value can occur. Default 3 for survival and 1 for all other tree types. For classification, this can be a vector of class-specific values. ##' @param max.depth Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree). ##' @param replace Sample with replacement. ##' @param sample.fraction Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values. 
@@ -359,6 +359,15 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, stop("Error: Unsupported type of dependent variable.") } + ## Number of levels + if (treetype %in% c(1, 9)) { + if (is.factor(y)) { + num_levels <- nlevels(y) + } else { + num_levels <- length(unique(y)) + } + } + ## Quantile prediction only for regression if (quantreg && treetype != 3) { stop("Error: Quantile prediction implemented only for regression outcomes.") @@ -522,16 +531,46 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Minimum node size if (is.null(min.node.size)) { min.node.size <- 0 - } else if (!is.numeric(min.node.size) || min.node.size < 0) { - stop("Error: Invalid value for min.node.size") + } else if (!is.numeric(min.node.size)) { + stop("Error: Invalid value for min.node.size.") + } + if (length(min.node.size) > 1) { + if (!(treetype %in% c(1, 9))) { + stop("Error: Invalid value for min.node.size. Vector values only valid for classification forests.") + } + if (any(min.node.size < 0)) { + stop("Error: Invalid value for min.node.size. Please give a nonnegative value or a vector of nonnegative values.") + } + if (length(min.node.size) != num_levels) { + stop("Error: Invalid value for min.node.size Expecting ", num_levels, " values, provided ", length(min.node.size), ".") + } + } else { + if (min.node.size < 0) { + stop("Error: Invalid value for min.node.size. 
Please give a nonnegative value or a vector of nonnegative values.") + } } ## Minimum bucket size if (is.null(min.bucket)) { min.bucket <- 0 - } else if (!is.numeric(min.bucket) || min.bucket < 0) { + } else if (!is.numeric(min.bucket)) { stop("Error: Invalid value for min.bucket") } + if (length(min.bucket) > 1) { + if (!(treetype %in% c(1, 9))) { + stop("Error: Invalid value for min.bucket Vector values only valid for classification forests.") + } + if (any(min.bucket < 0)) { + stop("Error: Invalid value for min.bucket Please give a nonnegative value or a vector of nonnegative values.") + } + if (length(min.bucket) != num_levels) { + stop("Error: Invalid value for min.bucket Expecting ", num_levels, " values, provided ", length(min.bucket), ".") + } + } else { + if (min.bucket < 0) { + stop("Error: Invalid value for min.bucket Please give a nonnegative value or a vector of nonnegative values.") + } + } ## Tree depth if (is.null(max.depth)) { @@ -554,8 +593,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (sum(sample.fraction) <= 0) { stop("Error: Invalid value for sample.fraction. Sum of values must be >0.") } - if (length(sample.fraction) != nlevels(y)) { - stop("Error: Invalid value for sample.fraction. Expecting ", nlevels(y), " values, provided ", length(sample.fraction), ".") + if (length(sample.fraction) != num_levels) { + stop("Error: Invalid value for sample.fraction. Expecting ", num_levels, " values, provided ", length(sample.fraction), ".") } if (!replace & any(sample.fraction * length(y) > table(y))) { idx <- which(sample.fraction * length(y) > table(y))[1] diff --git a/man/ranger.Rd b/man/ranger.Rd index 61c6e5dfa..6b7465b8a 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -64,9 +64,9 @@ ranger( \item{probability}{Grow a probability forest as in Malley et al. (2012).} -\item{min.node.size}{Minimal node size to split at. 
Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability.} +\item{min.node.size}{Minimal node size to split at. Default 1 for classification, 5 for regression, 3 for survival, and 10 for probability. For classification, this can be a vector of class-specific values.} -\item{min.bucket}{Minimal terminal node size. No nodes smaller than this value can occur. Default 3 for survival and 1 for all other tree types.} +\item{min.bucket}{Minimal terminal node size. No nodes smaller than this value can occur. Default 3 for survival and 1 for all other tree types. For classification, this can be a vector of class-specific values.} \item{max.depth}{Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree).} diff --git a/src/Forest.cpp b/src/Forest.cpp index 8c7a42422..7100cca36 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -27,7 +27,7 @@ namespace ranger { Forest::Forest() : - verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), min_bucket(0), num_independent_variables(0), seed(0), num_samples( + verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size({0}), min_bucket({0}), num_independent_variables(0), seed(0), num_samples( 0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting( false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout( false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( @@ -62,6 +62,9 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode if (!load_forest_filename.empty()) { prediction_mode = true; } + + std::vector min_node_size_vector = { min_node_size }; + std::vector min_bucket_vector = { min_bucket }; // Sample fraction default and convert to vector if (sample_fraction == 0) { @@ -79,7 +82,7 @@ void Forest::initCpp(std::string 
dependent_variable_name, MemoryMode memory_mode // Call other init function init(loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, - min_node_size, min_bucket, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, + min_node_size_vector, min_bucket_vector, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, false, max_depth, regularization_factor, regularization_usedepth, false); @@ -133,7 +136,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode // #nocov end void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, - uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, + uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, std::vector>& split_select_weights, const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, @@ -178,7 +181,7 @@ void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, } void Forest::init(std::unique_ptr input_data, uint mtry, std::string output_prefix, - uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, 
uint num_random_splits, bool order_snps, @@ -323,7 +326,7 @@ void Forest::writeOutput() { *verbose_out << "Sample size: " << num_samples << std::endl; *verbose_out << "Number of independent variables: " << num_independent_variables << std::endl; *verbose_out << "Mtry: " << mtry << std::endl; - *verbose_out << "Target node size: " << min_node_size << std::endl; + *verbose_out << "Target node size: " << min_node_size[0] << std::endl; *verbose_out << "Variable importance mode: " << importance_mode << std::endl; *verbose_out << "Memory mode: " << memory_mode << std::endl; *verbose_out << "Seed: " << seed << std::endl; @@ -473,7 +476,7 @@ void Forest::grow() { } trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, tree_split_select_weights, - importance_mode, min_node_size, min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, + importance_mode, &min_node_size, &min_bucket, sample_with_replacement, memory_saving_splitting, splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, num_random_splits, max_depth, &regularization_factor, regularization_usedepth, &split_varIDs_used, save_node_stats); } diff --git a/src/Forest.h b/src/Forest.h index 5b2972023..e92791543 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -48,7 +48,7 @@ class Forest { bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth, const std::vector& regularization_factor, bool regularization_usedepth); void initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, - uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, + uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, std::vector>& split_select_weights, const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool
memory_saving_splitting, SplitRule splitrule, @@ -58,7 +58,7 @@ class Forest { const std::vector& regularization_factor, bool regularization_usedepth, bool node_stats); void init(std::unique_ptr input_data, uint mtry, std::string output_prefix, - uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, std::vector& min_node_size, std::vector& min_bucket, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, @@ -119,10 +119,10 @@ class Forest { uint getMtry() const { return mtry; } - uint getMinNodeSize() const { + const std::vector& getMinNodeSize() const { return min_node_size; } - uint getMinBucket() const { + const std::vector& getMinBucket() const { return min_bucket; } size_t getNumIndependentVariables() const { @@ -209,8 +209,8 @@ class Forest { std::vector dependent_variable_names; // time,status for survival size_t num_trees; uint mtry; - uint min_node_size; - uint min_bucket; + std::vector min_node_size; + std::vector min_bucket; size_t num_independent_variables; uint seed; size_t num_samples; diff --git a/src/ForestClassification.cpp b/src/ForestClassification.cpp index bb6861f79..1f26aeb33 100644 --- a/src/ForestClassification.cpp +++ b/src/ForestClassification.cpp @@ -54,13 +54,13 @@ void ForestClassification::initInternal() { } // Set minimal node size - if (min_node_size == 0) { - min_node_size = DEFAULT_MIN_NODE_SIZE_CLASSIFICATION; + if (min_node_size.size() == 1 && min_node_size[0] == 0) { + min_node_size[0] = DEFAULT_MIN_NODE_SIZE_CLASSIFICATION; } // Set minimal bucket size - if (min_bucket == 0) { - min_bucket = DEFAULT_MIN_BUCKET; + if (min_bucket.size() == 1 && 
min_bucket[0] == 0) { + min_bucket[0] = DEFAULT_MIN_BUCKET; } // Create class_values and response_classIDs diff --git a/src/ForestProbability.cpp b/src/ForestProbability.cpp index 40922554f..817b2de55 100644 --- a/src/ForestProbability.cpp +++ b/src/ForestProbability.cpp @@ -59,13 +59,13 @@ void ForestProbability::initInternal() { } // Set minimal node size - if (min_node_size == 0) { - min_node_size = DEFAULT_MIN_NODE_SIZE_PROBABILITY; + if (min_node_size.size() == 1 && min_node_size[0] == 0) { + min_node_size[0] = DEFAULT_MIN_NODE_SIZE_PROBABILITY; } // Set minimal bucket size - if (min_bucket == 0) { - min_bucket = DEFAULT_MIN_BUCKET; + if (min_bucket.size() == 1 && min_bucket[0] == 0) { + min_bucket[0] = DEFAULT_MIN_BUCKET; } // Create class_values and response_classIDs diff --git a/src/ForestRegression.cpp b/src/ForestRegression.cpp index 7c1bb3269..6328ac207 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -48,13 +48,13 @@ void ForestRegression::initInternal() { } // Set minimal node size - if (min_node_size == 0) { - min_node_size = DEFAULT_MIN_NODE_SIZE_REGRESSION; + if (min_node_size.size() == 1 && min_node_size[0] == 0) { + min_node_size[0] = DEFAULT_MIN_NODE_SIZE_REGRESSION; } // Set minimal bucket size - if (min_bucket == 0) { - min_bucket = DEFAULT_MIN_BUCKET; + if (min_bucket.size() == 1 && min_bucket[0] == 0) { + min_bucket[0] = DEFAULT_MIN_BUCKET; } // Error if beta splitrule used with data outside of [0,1] diff --git a/src/ForestSurvival.cpp b/src/ForestSurvival.cpp index 3b88e3122..9a31f3da3 100644 --- a/src/ForestSurvival.cpp +++ b/src/ForestSurvival.cpp @@ -98,13 +98,13 @@ void ForestSurvival::initInternal() { } // Set minimal node size - if (min_node_size == 0) { - min_node_size = DEFAULT_MIN_NODE_SIZE_SURVIVAL; + if (min_node_size.size() == 1 && min_node_size[0] == 0) { + min_node_size[0] = DEFAULT_MIN_NODE_SIZE_SURVIVAL; } // Set minimal bucket size - if (min_bucket == 0) { - min_bucket = DEFAULT_MIN_BUCKET_SURVIVAL; + 
if (min_bucket.size() == 1 && min_bucket[0] == 0) { + min_bucket[0] = DEFAULT_MIN_BUCKET_SURVIVAL; } // Sort data if extratrees and not memory saving mode diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a47c22347..254fbe782 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -13,7 +13,7 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // rangerCpp -Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, bool node_stats, std::vector& time_interest, bool use_time_interest); +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, std::vector& min_node_size, std::vector& min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool 
use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag, std::vector& regularization_factor, bool use_regularization_factor, bool regularization_usedepth, bool node_stats, std::vector& time_interest, bool use_time_interest); RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP min_bucketSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP, SEXP regularization_factorSEXP, SEXP use_regularization_factorSEXP, SEXP regularization_usedepthSEXP, SEXP 
node_statsSEXP, SEXP time_interestSEXP, SEXP use_time_interestSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -29,8 +29,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< uint >::type num_threads(num_threadsSEXP); Rcpp::traits::input_parameter< bool >::type write_forest(write_forestSEXP); Rcpp::traits::input_parameter< uint >::type importance_mode_r(importance_mode_rSEXP); - Rcpp::traits::input_parameter< uint >::type min_node_size(min_node_sizeSEXP); - Rcpp::traits::input_parameter< uint >::type min_bucket(min_bucketSEXP); + Rcpp::traits::input_parameter< std::vector& >::type min_node_size(min_node_sizeSEXP); + Rcpp::traits::input_parameter< std::vector& >::type min_bucket(min_bucketSEXP); Rcpp::traits::input_parameter< std::vector>& >::type split_select_weights(split_select_weightsSEXP); Rcpp::traits::input_parameter< bool >::type use_split_select_weights(use_split_select_weightsSEXP); Rcpp::traits::input_parameter< std::vector& >::type always_split_variable_names(always_split_variable_namesSEXP); diff --git a/src/Tree.cpp b/src/Tree.cpp index 57d3dfbdd..fd97f686d 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -39,7 +39,7 @@ Tree::Tree(std::vector>& child_nodeIDs, std::vector& } void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, - std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, + std::vector* split_select_weights, ImportanceMode importance_mode, std::vector* min_node_size, std::vector* min_bucket, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, uint max_depth, std::vector* regularization_factor, @@ -90,7 +90,7 @@ void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std: void Tree::grow(std::vector* variable_importance) { // 
Allocate memory for tree growing allocateMemory(); - + this->variable_importance = variable_importance; // Bootstrap, dependent if weighted or not and with or without replacement @@ -307,7 +307,7 @@ void Tree::createPossibleSplitVarSubset(std::vector& result) { } bool Tree::splitNode(size_t nodeID) { - + // Select random subset of variables to possibly split at std::vector possible_split_varIDs; createPossibleSplitVarSubset(possible_split_varIDs); diff --git a/src/Tree.h b/src/Tree.h index 101c300df..17b98ff0f 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -36,7 +36,7 @@ class Tree { Tree& operator=(const Tree&) = delete; void init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, - std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, uint min_bucket, + std::vector* split_select_weights, ImportanceMode importance_mode, std::vector* min_node_size, std::vector* min_bucket, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, std::vector* manual_inbag, bool keep_inbag, std::vector* sample_fraction, double alpha, double minprop, bool holdout, uint num_random_splits, @@ -166,10 +166,10 @@ class Tree { size_t num_samples_oob; // Minimum node size to split, nodes of smaller size can be produced - uint min_node_size; + std::vector* min_node_size; // Minimum bucket size, minimum number of samples in each node - uint min_bucket; + std::vector* min_bucket; // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable // Deterministic variables are always selected diff --git a/src/TreeClassification.cpp b/src/TreeClassification.cpp index bbb3b581e..28ca371fb 100644 --- a/src/TreeClassification.cpp +++ b/src/TreeClassification.cpp @@ -85,7 +85,7 @@ bool TreeClassification::splitNodeInternal(size_t nodeID, std::vector& p } // Stop if maximum node size or depth reached - if 
(num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if ((min_node_size->size() == 1 && num_samples_node <= (*min_node_size)[0]) || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { split_values[nodeID] = estimate(nodeID); return true; } @@ -166,9 +166,19 @@ bool TreeClassification::findBestSplit(size_t nodeID, std::vector& possi uint sample_classID = (*response_classIDs)[sampleID]; ++class_counts[sample_classID]; } + + // Stop if class-wise minimal node size reached + if (min_node_size->size() > 1) { + for (size_t j = 0; j < num_classes; ++j) { + if (class_counts[j] < (*min_node_size)[j]) { + return true; + } + } + } -// Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + // TODO: Possible to stop early for class-wise min_bucket? + // Stop early if no split posssible + if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -283,7 +293,7 @@ void TreeClassification::findBestSplitValueSmallQ(size_t nodeID, size_t varID, s } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -317,6 +327,21 @@ void TreeClassification::findBestSplitValueSmallQ(size_t nodeID, size_t varID, s // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_right = class_counts[j] - class_counts_left[j]; + if (class_counts_left[j] < (*min_bucket)[j] || class_count_right < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Regularization regularize(decrease, varID); @@ -375,7 +400,7 @@ void 
TreeClassification::findBestSplitValueLargeQ(size_t nodeID, size_t varID, s } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -409,6 +434,21 @@ void TreeClassification::findBestSplitValueLargeQ(size_t nodeID, size_t varID, s // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_right = class_counts[j] - class_counts_left[j]; + if (class_counts_left[j] < (*min_bucket)[j] || class_count_right < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Regularization regularize(decrease, varID); @@ -486,7 +526,7 @@ void TreeClassification::findBestSplitValueUnordered(size_t nodeID, size_t varID size_t n_left = num_samples_node - n_right; // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -516,6 +556,21 @@ void TreeClassification::findBestSplitValueUnordered(size_t nodeID, size_t varID // Decrease of impurity decrease = sum_left / (double) n_left + sum_right / (double) n_right; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_left = class_counts[j] - class_counts_right[j]; + if (class_count_left < (*min_bucket)[j] || class_counts_right[j] < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Regularization regularize(decrease, varID); @@ -544,9 +599,19 @@ bool TreeClassification::findBestSplitExtraTrees(size_t nodeID, std::vectorsize() > 1) { + for (size_t j = 0; j < 
num_classes; ++j) { + if (class_counts[j] < (*min_node_size)[j]) { + return true; + } + } + } + // TODO: Possible to stop early for class-wise min_bucket? // Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -657,7 +722,7 @@ void TreeClassification::findBestSplitValueExtraTrees(size_t nodeID, size_t varI } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right[i] < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right[i] < (*min_bucket)[0])) { continue; } @@ -671,6 +736,21 @@ void TreeClassification::findBestSplitValueExtraTrees(size_t nodeID, size_t varI sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left * class_count_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_left = class_counts[j] - class_counts_right[j]; + if (class_count_left < (*min_bucket)[j] || class_counts_right[j] < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Decrease of impurity double decrease = sum_left / (double) n_left + sum_right / (double) n_right[i]; @@ -768,7 +848,7 @@ void TreeClassification::findBestSplitValueExtraTreesUnordered(size_t nodeID, si size_t n_left = num_samples_node - n_right; // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -782,6 +862,21 @@ void TreeClassification::findBestSplitValueExtraTreesUnordered(size_t nodeID, si sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left 
* class_count_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_left = class_counts[j] - class_counts_right[j]; + if (class_count_left < (*min_bucket)[j] || class_counts_right[j] < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Decrease of impurity double decrease = sum_left / (double) n_left + sum_right / (double) n_right; diff --git a/src/TreeProbability.cpp b/src/TreeProbability.cpp index 53d630b7c..5cb61f467 100644 --- a/src/TreeProbability.cpp +++ b/src/TreeProbability.cpp @@ -89,7 +89,7 @@ bool TreeProbability::splitNodeInternal(size_t nodeID, std::vector& poss } // Stop if maximum node size or depth reached - if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if ((min_node_size->size() == 1 && num_samples_node <= (*min_node_size)[0]) || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { if (!save_node_stats) { addToTerminalNodes(nodeID); } @@ -170,9 +170,19 @@ bool TreeProbability::findBestSplit(size_t nodeID, std::vector& possible uint sample_classID = (*response_classIDs)[sampleID]; ++class_counts[sample_classID]; } + + // Stop if class-wise minimal node size reached + if (min_node_size->size() > 1) { + for (size_t j = 0; j < num_classes; ++j) { + if (class_counts[j] < (*min_node_size)[j]) { + return true; + } + } + } + // TODO: Possible to stop early for class-wise min_bucket? 
// Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -287,7 +297,7 @@ void TreeProbability::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -321,6 +331,21 @@ void TreeProbability::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_right = class_counts[j] - class_counts_left[j]; + if (class_counts_left[j] < (*min_bucket)[j] || class_count_right < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Regularization regularize(decrease, varID); @@ -379,7 +404,7 @@ void TreeProbability::findBestSplitValueLargeQ(size_t nodeID, size_t varID, size } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -413,6 +438,21 @@ void TreeProbability::findBestSplitValueLargeQ(size_t nodeID, size_t varID, size // Decrease of impurity decrease = sum_right / (double) n_right + sum_left / (double) n_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_right = class_counts[j] - class_counts_left[j]; + if (class_counts_left[j] < (*min_bucket)[j] || class_count_right < (*min_bucket)[j]) { + stop = true; + break; + } + 
} + if (stop) { + continue; + } + } // Regularization regularize(decrease, varID); @@ -490,7 +530,7 @@ void TreeProbability::findBestSplitValueUnordered(size_t nodeID, size_t varID, s size_t n_left = num_samples_node - n_right; // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -520,6 +560,21 @@ void TreeProbability::findBestSplitValueUnordered(size_t nodeID, size_t varID, s // Decrease of impurity decrease = sum_left / (double) n_left + sum_right / (double) n_right; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_left = class_counts[j] - class_counts_right[j]; + if (class_count_left < (*min_bucket)[j] || class_counts_right[j] < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Regularization regularize(decrease, varID); @@ -548,9 +603,19 @@ bool TreeProbability::findBestSplitExtraTrees(size_t nodeID, std::vector uint sample_classID = (*response_classIDs)[sampleID]; ++class_counts[sample_classID]; } + + // Stop if class-wise minimal node size reached + if (min_node_size->size() > 1) { + for (size_t j = 0; j < num_classes; ++j) { + if (class_counts[j] < (*min_node_size)[j]) { + return true; + } + } + } + // TODO: Possible to stop early for class-wise min_bucket? 
// Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -661,7 +726,7 @@ void TreeProbability::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right[i] < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right[i] < (*min_bucket)[0])) { continue; } @@ -675,6 +740,21 @@ void TreeProbability::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left * class_count_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < num_classes; ++j) { + size_t class_count_left = class_counts[j] - class_counts_right[j]; + if (class_count_left < (*min_bucket)[j] || class_counts_right[j] < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Decrease of impurity double decrease = sum_left / (double) n_left + sum_right / (double) n_right[i]; @@ -772,7 +852,7 @@ void TreeProbability::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_ size_t n_left = num_samples_node - n_right; // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (min_bucket->size() == 1 && (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0])) { continue; } @@ -786,6 +866,21 @@ void TreeProbability::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_ sum_right += (*class_weights)[j] * class_count_right * class_count_right; sum_left += (*class_weights)[j] * class_count_left * class_count_left; } + + // Stop if class-wise minimal bucket size reached + if (min_bucket->size() > 1) { + bool stop = false; + for (size_t j = 0; j < 
num_classes; ++j) { + size_t class_count_left = class_counts[j] - class_counts_right[j]; + if (class_count_left < (*min_bucket)[j] || class_counts_right[j] < (*min_bucket)[j]) { + stop = true; + break; + } + } + if (stop) { + continue; + } + } // Decrease of impurity double decrease = sum_left / (double) n_left + sum_right / (double) n_right; diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index c272695be..ec59528fb 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -68,7 +68,7 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi } // Stop if maximum node size or depth reached - if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if (num_samples_node <= (*min_node_size)[0] || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { split_values[nodeID] = estimate(nodeID); return true; } @@ -150,7 +150,7 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ } // Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (num_samples_node >= 2 * (*min_bucket)[0]) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -261,7 +261,7 @@ void TreeRegression::findBestSplitValueSmallQ(size_t nodeID, size_t varID, doubl } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0]) { continue; } @@ -323,7 +323,7 @@ void TreeRegression::findBestSplitValueLargeQ(size_t nodeID, size_t varID, doubl } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0]) { continue; } @@ -405,7 +405,7 @@ void TreeRegression::findBestSplitValueUnordered(size_t nodeID, size_t varID, do size_t n_left = num_samples_node - n_right; // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < 
min_bucket) { + if (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0]) { continue; } @@ -548,7 +548,7 @@ bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector& } // Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (num_samples_node >= 2 * (*min_bucket)[0]) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -658,7 +658,7 @@ void TreeRegression::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, d } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right[i] < min_bucket) { + if (n_left < (*min_bucket)[0] || n_right[i] < (*min_bucket)[0]) { continue; } @@ -758,7 +758,7 @@ void TreeRegression::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t size_t n_left = num_samples_node - n_right; // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right < min_bucket) { + if (n_left < (*min_bucket)[0] || n_right < (*min_bucket)[0]) { continue; } @@ -793,7 +793,7 @@ bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector& possi } // Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (num_samples_node >= 2 * (*min_bucket)[0]) { // For all possible split variables find best split value for (auto& varID : possible_split_varIDs) { @@ -886,7 +886,7 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double } // Stop if minimal bucket size reached - if (n_left < min_bucket || n_right[i] < min_bucket) { + if (n_left < (*min_bucket)[0] || n_right[i] < (*min_bucket)[0]) { continue; } diff --git a/src/TreeSurvival.cpp b/src/TreeSurvival.cpp index 1c60ba8bf..7ae947681 100644 --- a/src/TreeSurvival.cpp +++ b/src/TreeSurvival.cpp @@ -140,7 +140,7 @@ bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_sp } // Stop if maximum node size or depth reached - if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if 
(num_samples_node <= (*min_node_size)[0] || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { if (!save_node_stats) { computeSurvival(nodeID); } @@ -148,7 +148,7 @@ bool TreeSurvival::findBestSplit(size_t nodeID, std::vector& possible_sp } // Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (num_samples_node >= 2 * (*min_bucket)[0]) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -200,7 +200,7 @@ bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& poss size_t num_samples_node = end_pos[nodeID] - start_pos[nodeID]; // Stop if maximum node size or depth reached - if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if (num_samples_node <= (*min_node_size)[0] || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { if (!save_node_stats) { computeDeathCounts(nodeID); computeSurvival(nodeID); @@ -414,7 +414,7 @@ void TreeSurvival::findBestSplitValueLogRank(size_t nodeID, size_t varID, double // Stop if minimal bucket size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child[i]; - if (num_samples_right_child[i] < min_bucket || num_samples_left_child < min_bucket) { + if (num_samples_right_child[i] < (*min_bucket)[0] || num_samples_left_child < (*min_bucket)[0]) { continue; } @@ -520,7 +520,7 @@ void TreeSurvival::findBestSplitValueLogRankUnordered(size_t nodeID, size_t varI // Stop if minimal bucket size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child; - if (num_samples_right_child < min_bucket || num_samples_left_child < min_bucket) { + if (num_samples_right_child < (*min_bucket)[0] || num_samples_left_child < (*min_bucket)[0]) { continue; } @@ -611,7 +611,7 @@ void TreeSurvival::findBestSplitValueAUC(size_t nodeID, size_t varID, double& be for (size_t i = 0; i < num_splits; ++i) { // Do not consider this split point if 
fewer than min_bucket samples in one node size_t num_samples_right_child = num_node_samples - num_samples_left_child[i]; - if (num_samples_left_child[i] < min_bucket || num_samples_right_child < min_bucket) { + if (num_samples_left_child[i] < (*min_bucket)[0] || num_samples_right_child < (*min_bucket)[0]) { continue; } else { double auc = fabs((num_count[i] / 2) / num_total[i] - 0.5); @@ -711,7 +711,7 @@ bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& p } // Stop if maximum node size or depth reached - if (num_samples_node <= min_node_size || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { + if (num_samples_node <= (*min_node_size)[0] || (nodeID >= last_left_nodeID && max_depth > 0 && depth >= max_depth)) { if (!save_node_stats) { computeSurvival(nodeID); } @@ -719,7 +719,7 @@ bool TreeSurvival::findBestSplitExtraTrees(size_t nodeID, std::vector& p } // Stop early if no split posssible - if (num_samples_node >= 2 * min_bucket) { + if (num_samples_node >= 2 * (*min_bucket)[0]) { // For all possible split variables for (auto& varID : possible_split_varIDs) { @@ -805,7 +805,7 @@ void TreeSurvival::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, dou // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child[i]; - if (num_samples_right_child[i] < min_bucket || num_samples_left_child < min_bucket) { + if (num_samples_right_child[i] < (*min_bucket)[0] || num_samples_left_child < (*min_bucket)[0]) { continue; } @@ -934,7 +934,7 @@ void TreeSurvival::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t v // Stop if minimal node size reached size_t num_samples_left_child = num_samples_node - num_samples_right_child; - if (num_samples_right_child < min_bucket || num_samples_left_child < min_bucket) { + if (num_samples_right_child < (*min_bucket)[0] || num_samples_left_child < (*min_bucket)[0]) { continue; } diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp 
index c8c4fed21..66e34c0e5 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -50,7 +50,7 @@ using namespace ranger; // [[Rcpp::export]] Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, - bool write_forest, uint importance_mode_r, uint min_node_size, uint min_bucket, + bool write_forest, uint importance_mode_r, std::vector& min_node_size, std::vector& min_bucket, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, diff --git a/tests/testthat/test_ranger.R b/tests/testthat/test_ranger.R index 0f2c0118a..40c788c0f 100644 --- a/tests/testthat/test_ranger.R +++ b/tests/testthat/test_ranger.R @@ -86,6 +86,14 @@ test_that("Inbag counts match sample fraction, classification", { expect_equal(unique(colSums(inbag[dat$Species == "setosa", ])), 15) expect_equal(unique(colSums(inbag[dat$Species == "versicolor", ])), 30) expect_equal(unique(colSums(inbag[dat$Species == "virginica", ])), 45) + + ## No factor outcome + rf <- ranger(Species ~ ., data.matrix(iris), num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), + replace = TRUE, keep.inbag = TRUE, classification = TRUE) + inbag <- do.call(cbind, rf$inbag.counts) + expect_equal(unique(colSums(inbag[iris$Species == "setosa", ])), 30) + expect_equal(unique(colSums(inbag[iris$Species == "versicolor", ])), 45) + expect_equal(unique(colSums(inbag[iris$Species == "virginica", ])), 60) }) test_that("Inbag counts match sample fraction, probability", { @@ -104,6 +112,14 @@ test_that("Inbag counts match sample fraction, probability", { expect_equal(unique(colSums(inbag[1:50, ])), 15) expect_equal(unique(colSums(inbag[51:100, ])), 30) expect_equal(unique(colSums(inbag[101:150, ])), 45) + + ## No factor outcome + rf <- 
ranger(Species ~ ., data.matrix(iris), num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), + replace = TRUE, keep.inbag = TRUE, probability = TRUE) + inbag <- do.call(cbind, rf$inbag.counts) + expect_equal(unique(colSums(inbag[1:50, ])), 30) + expect_equal(unique(colSums(inbag[51:100, ])), 45) + expect_equal(unique(colSums(inbag[101:150, ])), 60) }) test_that("as.factor() in formula works", { @@ -382,3 +398,85 @@ test_that("min.bucket creates nodes of correct size", { })) expect_gte(smallest_node, min.bucket) }) + +test_that("Vector min.bucket creates nodes of correct size", { + + # Size 2,3,4 + rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, + min.bucket = c(2, 3, 4), keep.inbag = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + + smallest_nodes <- sapply(1:ncol(pred), function(i) { + pred1 <- pred[which(inbag[, i][1:50]), i] + pred2 <- pred[which(inbag[, i][51:100]) + 50, i] + pred3 <- pred[which(inbag[, i][101:150]) + 100, i] + + pred <- rbind(data.frame(class = 1, node = pred1), + data.frame(class = 2, node = pred2), + data.frame(class = 3, node = pred3)) + apply(table(pred), 1, min) + }) + + expect_true(all(smallest_nodes >= matrix(c(2, 3, 4), ncol = 5, nrow = 3))) + + # Size 4,3,2 + rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, + min.bucket = c(4, 3, 2), keep.inbag = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + + smallest_nodes <- sapply(1:ncol(pred), function(i) { + pred1 <- pred[which(inbag[, i][1:50]), i] + pred2 <- pred[which(inbag[, i][51:100]) + 50, i] + pred3 <- pred[which(inbag[, i][101:150]) + 100, i] + + pred <- rbind(data.frame(class = 1, node = pred1), + data.frame(class = 2, node = pred2), + data.frame(class = 3, node = pred3)) + apply(table(pred), 1, min) + }) + + expect_true(all(smallest_nodes >= matrix(c(4, 3, 2), ncol = 5, nrow = 3))) + + # Random size + 
min.bucket <- round(runif(3, 1, 10)) + rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, + min.bucket = min.bucket, keep.inbag = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + + smallest_nodes <- sapply(1:ncol(pred), function(i) { + pred1 <- pred[which(inbag[, i][1:50]), i] + pred2 <- pred[which(inbag[, i][51:100]) + 50, i] + pred3 <- pred[which(inbag[, i][101:150]) + 100, i] + + pred <- rbind(data.frame(class = 1, node = pred1), + data.frame(class = 2, node = pred2), + data.frame(class = 3, node = pred3)) + apply(table(pred), 1, min) + }) + + expect_true(all(smallest_nodes >= matrix(min.bucket, ncol = 5, nrow = 3))) + + # No factor outcome + rf <- ranger(Species ~ ., data.matrix(iris), num.trees = 5, replace = FALSE, + min.bucket = c(2, 3, 4), keep.inbag = TRUE, classification = TRUE) + pred <- predict(rf, iris, type = "terminalNodes")$prediction + inbag <- sapply(rf$inbag.counts, function(x) x == 1) + + smallest_nodes <- sapply(1:ncol(pred), function(i) { + pred1 <- pred[which(inbag[, i][1:50]), i] + pred2 <- pred[which(inbag[, i][51:100]) + 50, i] + pred3 <- pred[which(inbag[, i][101:150]) + 100, i] + + pred <- rbind(data.frame(class = 1, node = pred1), + data.frame(class = 2, node = pred2), + data.frame(class = 3, node = pred3)) + apply(table(pred), 1, min) + }) + + expect_true(all(smallest_nodes >= matrix(c(2, 3, 4), ncol = 5, nrow = 3))) +}) + + From d5b3e66f8d3b049b2a0274d3630ab004141e4771 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Thu, 16 May 2024 08:18:45 +0200 Subject: [PATCH 109/111] early stopping for vector bucket size --- src/TreeClassification.cpp | 86 +++++++++++++++++++++++-------------- src/TreeProbability.cpp | 88 +++++++++++++++++++++++--------------- 2 files changed, 107 insertions(+), 67 deletions(-) diff --git a/src/TreeClassification.cpp b/src/TreeClassification.cpp index 28ca371fb..a21297776 100644 --- a/src/TreeClassification.cpp +++ 
b/src/TreeClassification.cpp @@ -175,35 +175,45 @@ bool TreeClassification::findBestSplit(size_t nodeID, std::vector& possi } } } - - // TODO: Possible to stop early for class-wise min_bucket? + // Stop early if no split posssible - if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { + if (min_bucket->size() == 1) { + if (num_samples_node < 2 * (*min_bucket)[0]) { + return true; + } + } else { + uint sum_min_bucket = 0; + for (size_t j = 0; j < num_classes; ++j) { + sum_min_bucket += (*min_bucket)[j]; + } + if (num_samples_node < sum_min_bucket) { + return true; + } + } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { - // Use memory saving method if option set - if (memory_saving_splitting) { + // Use memory saving method if option set + if (memory_saving_splitting) { + findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + // Use faster method for both cases + double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); + if (q < Q_THRESHOLD) { findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } else { - // Use faster method for both cases - double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); - if (q < Q_THRESHOLD) { - findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, 
best_value, best_varID, - best_decrease); - } + findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); } - } else { - findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); } + } else { + findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); } } @@ -609,20 +619,30 @@ bool TreeClassification::findBestSplitExtraTrees(size_t nodeID, std::vectorsize() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { + if (min_bucket->size() == 1) { + if (num_samples_node < 2 * (*min_bucket)[0]) { + return true; + } + } else { + uint sum_min_bucket = 0; + for (size_t j = 0; j < num_classes; ++j) { + sum_min_bucket += (*min_bucket)[j]; + } + if (num_samples_node < sum_min_bucket) { + return true; + } + } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { - findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, - best_varID, best_decrease); - } + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, + best_varID, best_decrease); } } diff --git a/src/TreeProbability.cpp 
b/src/TreeProbability.cpp index 5cb61f467..c92a57137 100644 --- a/src/TreeProbability.cpp +++ b/src/TreeProbability.cpp @@ -179,35 +179,45 @@ bool TreeProbability::findBestSplit(size_t nodeID, std::vector& possible } } } - - // TODO: Possible to stop early for class-wise min_bucket? + // Stop early if no split posssible - if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { + if (min_bucket->size() == 1) { + if (num_samples_node < 2 * (*min_bucket)[0]) { + return true; + } + } else { + uint sum_min_bucket = 0; + for (size_t j = 0; j < num_classes; ++j) { + sum_min_bucket += (*min_bucket)[j]; + } + if (num_samples_node < sum_min_bucket) { + return true; + } + } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { - // Use memory saving method if option set - if (memory_saving_splitting) { + // Use memory saving method if option set + if (memory_saving_splitting) { + findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); + } else { + // Use faster method for both cases + double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); + if (q < Q_THRESHOLD) { findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, best_decrease); } else { - // Use faster method for both cases - double q = (double) num_samples_node / (double) data->getNumUniqueDataValues(varID); - if (q < Q_THRESHOLD) { - findBestSplitValueSmallQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } 
else { - findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } + findBestSplitValueLargeQ(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); } - } else { - findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); } + } else { + findBestSplitValueUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + best_decrease); } } @@ -612,21 +622,31 @@ bool TreeProbability::findBestSplitExtraTrees(size_t nodeID, std::vector } } } - - // TODO: Possible to stop early for class-wise min_bucket? + // Stop early if no split posssible - if (min_bucket->size() > 1 || (num_samples_node >= 2 * (*min_bucket)[0])) { + if (min_bucket->size() == 1) { + if (num_samples_node < 2 * (*min_bucket)[0]) { + return true; + } + } else { + uint sum_min_bucket = 0; + for (size_t j = 0; j < num_classes; ++j) { + sum_min_bucket += (*min_bucket)[j]; + } + if (num_samples_node < sum_min_bucket) { + return true; + } + } - // For all possible split variables - for (auto& varID : possible_split_varIDs) { - // Find best split value, if ordered consider all values as split values, else all 2-partitions - if (data->isOrderedVariable(varID)) { - findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, - best_decrease); - } else { - findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, - best_varID, best_decrease); - } + // For all possible split variables + for (auto& varID : possible_split_varIDs) { + // Find best split value, if ordered consider all values as split values, else all 2-partitions + if (data->isOrderedVariable(varID)) { + findBestSplitValueExtraTrees(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, best_varID, + 
best_decrease); + } else { + findBestSplitValueExtraTreesUnordered(nodeID, varID, num_classes, class_counts, num_samples_node, best_value, + best_varID, best_decrease); } } From 05dafd958c5aabb66fe6e53a1d795ab38534b329 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 May 2024 06:28:32 +0000 Subject: [PATCH 110/111] Bump JamesIves/github-pages-deploy-action from 4.4.1 to 4.6.0 Bumps [JamesIves/github-pages-deploy-action](https://github.com/jamesives/github-pages-deploy-action) from 4.4.1 to 4.6.0. - [Release notes](https://github.com/jamesives/github-pages-deploy-action/releases) - [Commits](https://github.com/jamesives/github-pages-deploy-action/compare/v4.4.1...v4.6.0) --- updated-dependencies: - dependency-name: JamesIves/github-pages-deploy-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index ed7650c73..44c27eadb 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -41,7 +41,7 @@ jobs: - name: Deploy to GitHub pages 🚀 if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.4.1 + uses: JamesIves/github-pages-deploy-action@v4.6.0 with: clean: false branch: gh-pages From 57d11af04103359f8d7a37aa0d4123be132478eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 May 2024 06:28:36 +0000 Subject: [PATCH 111/111] Bump actions/checkout from 2 to 4 Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 4. 
- [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/R-CMD-check.yaml | 2 +- .github/workflows/cpp-build.yaml | 4 ++-- .github/workflows/pkgdown.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1d19e9544..de55301f5 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -29,7 +29,7 @@ jobs: R_KEEP_PKG_SOURCE: yes steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 diff --git a/.github/workflows/cpp-build.yaml b/.github/workflows/cpp-build.yaml index 5a0962f4f..b6916d062 100644 --- a/.github/workflows/cpp-build.yaml +++ b/.github/workflows/cpp-build.yaml @@ -11,7 +11,7 @@ jobs: linux: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build run: | sudo apt-get install cmake @@ -21,7 +21,7 @@ jobs: macos: runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build run: | mkdir build && pushd build diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index ed7650c73..57aba3979 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -22,7 +22,7 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2