Skip to content

Commit

Permalink
Merge branch 'master' into setdt-wide
Browse files Browse the repository at this point in the history
  • Loading branch information
tdhock authored Apr 24, 2024
2 parents 1872f47 + 29a0f13 commit 1e758e6
Show file tree
Hide file tree
Showing 93 changed files with 1,948 additions and 847 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.dir-locals.el
^\.Rprofile$
^data\.table_.*\.tar\.gz$
^config\.log$
^vignettes/plots/figures$
^\.Renviron$
^[^/]+\.R$
Expand All @@ -16,7 +17,6 @@
^\.graphics$
^\.github$

^\.appveyor\.yml$
^\.gitlab-ci\.yml$

^Makefile$
Expand Down
71 changes: 0 additions & 71 deletions .appveyor.yml

This file was deleted.

105 changes: 105 additions & 0 deletions .ci/.lintr.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
for (f in list.files('ci/linters', full.names=TRUE)) source(f)
rm(f)

linters = all_linters(
packages = "lintr", # TODO(lintr->3.2.0): Remove this.
# eq_assignment_linter(),
brace_linter(allow_single_line = TRUE),
# TODO(michaelchirico): Activate these incrementally. These are the
# parameterizations that match our style guide.
# implicit_assignment_linter(allow_lazy = TRUE, allow_scoped = TRUE),
# implicit_integer_linter(allow_colon = TRUE),
# system_time_linter = undesirable_function_linter(c(
# system.time = "Only run timings in benchmark.Rraw"
# )),
# undesirable_function_linter(modify_defaults(
# default_undesirable_functions,
# ifelse = "Use fifelse instead.",
# Sys.setenv = NULL,
# library = NULL,
# options = NULL,
# par = NULL,
# setwd = NULL
# )),
undesirable_operator_linter(modify_defaults(
default_undesirable_operators,
`<<-` = NULL
)),
# TODO(lintr#2441): Use upstream implementation.
assignment_linter = NULL,
# TODO(lintr#2442): Use this once x[ , j, by] is supported.
commas_linter = NULL,
commented_code_linter = NULL,
# TODO(linter->3.2.0): Activate this.
consecutive_assertion_linter = NULL,
cyclocomp_linter = NULL,
function_argument_linter = NULL,
indentation_linter = NULL,
infix_spaces_linter = NULL,
# TODO(R>3.2.0): Activate this, extending to recognize vapply_1i(x, length).
lengths_linter = NULL,
line_length_linter = NULL,
missing_package_linter = NULL,
namespace_linter = NULL,
nonportable_path_linter = NULL,
object_name_linter = NULL,
object_usage_linter = NULL,
quotes_linter = NULL,
semicolon_linter = NULL,
spaces_inside_linter = NULL,
spaces_left_parentheses_linter = NULL,
# TODO(michaelchirico): Only exclude from vignettes, not sure what's wrong.
strings_as_factors_linter = NULL,
# TODO(lintr->3.2.0): Fix on a valid TODO style, enforce it, and re-activate.
todo_comment_linter = NULL,
# TODO(michaelchirico): Enforce these and re-activate them one-by-one. Also stop using '<<-'.
brace_linter = NULL,
condition_call_linter = NULL,
conjunct_test_linter = NULL,
fixed_regex_linter = NULL,
function_left_parentheses_linter = NULL,
if_not_else_linter = NULL,
implicit_assignment_linter = NULL,
implicit_integer_linter = NULL,
keyword_quote_linter = NULL,
length_levels_linter = NULL,
matrix_apply_linter = NULL,
missing_argument_linter = NULL,
nzchar_linter = NULL,
object_overwrite_linter = NULL,
paren_body_linter = NULL,
redundant_equals_linter = NULL,
rep_len_linter = NULL,
repeat_linter = NULL,
return_linter = NULL,
sample_int_linter = NULL,
scalar_in_linter = NULL,
seq_linter = NULL,
undesirable_function_linter = NULL,
unnecessary_concatenation_linter = NULL,
unnecessary_lambda_linter = NULL,
unnecessary_nesting_linter = NULL,
unreachable_code_linter = NULL,
unused_import_linter = NULL
)
# TODO(lintr#2172): Glob with lintr itself.
exclusions = local({
exclusion_for_dir <- function(dir, exclusions) {
files = list.files(dir, pattern = "\\.(R|Rmd)$")
stats::setNames(rep(list(exclusions), length(files)), files)
}
c(
exclusion_for_dir("tests", list(
quotes_linter = Inf,
# TODO(michaelchirico): Enforce these and re-activate them one-by-one.
implicit_integer_linter = Inf,
infix_spaces_linter = Inf,
undesirable_function_linter = Inf
)),
exclusion_for_dir("vignettes", list(
quotes_linter = Inf
# strings_as_factors_linter = Inf
# system_time_linter = Inf
))
)
})
4 changes: 0 additions & 4 deletions .ci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@ Artifacts:

TODO document

### [Appveyor](./../.appveyor.yml)

TODO document

## CI tools

### [`ci.R`](./ci.R)
Expand Down
126 changes: 126 additions & 0 deletions .ci/atime/tests.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# A function to customize R package metadata and source files to facilitate version-specific installation and testing.
#
# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package.
# It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately.
#
# @param old.Package Current name of the package.
# @param new.Package New name of the package, including a SHA hash.
# @param sha SHA1 hash used for differentiating versions.
# @param new.pkg.path Path to the package files.
#
# @details
# The function modifies:
# - DESCRIPTION, updating the package name.
# - Makevars, customizing the shared object file name and adjusting the build settings.
# - R/onLoad.R, adapting custom version checking for package loading operations.
# - NAMESPACE, changing namespace settings for dynamic linking.
#
# @examples
# pkg.edit.fun("data.table", "data.table.some_SHA1_hash", "some_SHA1_hash", "/path/to/data.table")
#
# @return None (performs in-place file modifications)
# @note This setup is typically unnecessary for most packages but essential for data.table due to its unique configuration.
pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
pkg_find_replace <- function(glob, FIND, REPLACE) {
atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
}
Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
new.Package_ <- paste0(Package_, "_", sha)
pkg_find_replace(
"DESCRIPTION",
paste0("Package:\\s+", old.Package),
paste("Package:", new.Package))
pkg_find_replace(
file.path("src", "Makevars.*in"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
sprintf('packageVersion\\("%s"\\)', old.Package),
sprintf('packageVersion\\("%s"\\)', new.Package))
pkg_find_replace(
file.path("src", "init.c"),
paste0("R_init_", Package_regex),
paste0("R_init_", gsub("[.]", "_", new.Package_)))
pkg_find_replace(
"NAMESPACE",
sprintf('useDynLib\\("?%s"?', Package_regex),
paste0('useDynLib(', new.Package_))
}

# A list of performance tests.
#
# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments:
# - N: A numeric sequence of data sizes to vary.
# - setup: An expression evaluated for every data size before measuring time/memory.
# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions.
# This must call a function from data.table using a syntax with double or triple colon prefix.
# The package name before the colons will be replaced by a new package name that uses the commit SHA hash.
# (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table)
#
# Optional parameters that may be useful to configure tests:
# - times: Number of times each expression is evaluated (default is 10).
# - seconds.limit: The maximum median timing (in seconds) of an expression. No timings for larger N are computed past that threshold.
# - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions.
# For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally).
# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
# nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
test.list <- list(
# Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
# Fixed in: https://github.com/Rdatatable/data.table/pull/4440
"shallow regression fixed in #4440" = list(
pkg.edit.fun = pkg.edit.fun,
N = 10^seq(3,8),
setup = quote({
set.seed(1L)
dt <- data.table(a = sample.int(N))
setindexv(dt, "a")
}),
expr = quote(data.table:::shallow(dt)),
# Before = "", This needs to be updated later as there are two issues here: A) The source of regression (or the particular commit that led to it) is not clear; B) Older versions of data.table are having problems when being installed in this manner (This includes commits from before March 20 2020, when the issue that discovered or first mentioned the regression was created)
Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)

# Test based on: https://github.com/Rdatatable/data.table/issues/5424
# Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
# Fixed in: https://github.com/Rdatatable/data.table/pull/5463
"memrecycle regression fixed in #5463" = list(
pkg.edit.fun = pkg.edit.fun,
N = 10^seq(3, 8),
setup = quote({
n <- N/100
set.seed(2L)
dt <- data.table(
g = sample(seq_len(n), N, TRUE),
x = runif(N),
key = "g")
dt_mod <- copy(dt)
}),
expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)),
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits)

# Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
# To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
"setDT improved in #5427" = list(
pkg.edit.fun = pkg.edit.fun,
N = 10^seq(1, 7),
setup = quote({
L <- replicate(N, 1, simplify = FALSE)
setDT(L)
}),
expr = quote({
data.table:::setattr(L, "class", NULL)
data.table:::setDT(L)
}),
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
)
# nolint end: undesirable_operator_linter.
10 changes: 10 additions & 0 deletions .dev/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# data.table developer

## Setup

To use the optional helper function `cc()`, one needs to set up the project path and source `.dev/cc.R` to use `cc()` conveniently. This works through creating an additional `.Rprofile` in the `data.table` directory.

```r
# content of .Rprofile in the package directory
Sys.setenv(PROJ_PATH="~/git/data.table")
source(".dev/cc.R")
```

## Utilities

### [`cc.R`](./cc.R)
Expand Down
24 changes: 23 additions & 1 deletion .dev/cc.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,28 @@ sourceDir = function(path=getwd(), trace = TRUE, ...) {
if(trace) cat("\n")
}

cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) {
# NB: since we only import from default packages, this is rarely needed, but useful for truly minimal dev environments (#6056)
sourceImports = function(path=getwd(), quiet=FALSE) {
nsFile = file.path(path, "NAMESPACE")
if (!file.exists(nsFile)) {
if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly")
return(invisible())
}
suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv
nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function
if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports)))
for (ii in seq_along(nsParsedImports)) {
entry = nsParsedImports[[ii]]
# getNamespaceExports includes weird objects but that's intentional, consider evalq(.__C__VIRTUAL, asNamespace("Rcpp")) due to import(methods) in that NAMESPACE
imported = if (length(entry) == 1L) getNamespaceExports(entry) else entry[[2L]]
# Assign directly to better imitate actual namespace imports. Earlier tried to require(include.only=) these objects, but R doesn't allow multiple such require, meaning we can't add more objects later in tests, see:
# https://stat.ethz.ch/pipermail/r-devel/2024-April/083319.html
for (import in imported) assign(import, getExportedValue(entry[[1L]], import), .GlobalEnv)
}
return(invisible())
}

cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) {
if (!missing(cc_dir)) {
warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead")
path = cc_dir
Expand Down Expand Up @@ -81,6 +102,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys
.GlobalEnv[[Call$name]] = Call$address
for (Extern in xx$.External)
.GlobalEnv[[Extern$name]] = Extern$address
sourceImports(path, quiet=quiet)
sourceDir(file.path(path, "R"), trace=!quiet)
if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table", "cbind.data.table"), envir=.GlobalEnv) # 3968 follow up
.GlobalEnv$testDir = function(x) file.path(path,"inst/tests",x)
Expand Down
Loading

0 comments on commit 1e758e6

Please sign in to comment.