From c27deac4ddd27d2b6bd42d691d641d8a5fae2dcc Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 19 Nov 2024 12:29:46 -0500 Subject: [PATCH] parse_data: Fix compatibility with R 4.4 (#588) * split_on_line_directives: guard against input without a directive get_parse_data extracts lines from the input srcfile object and feeds them to split_on_line_directives, which expects the lines to be a concatenation of all the package R files, separated by #line directives. With how get_parse_data is currently called, that expectation is met. get_parse_data is called only if utils::getParseData returns NULL, and getParseData doesn't return NULL for any of the cases where the input does _not_ have line directives (i.e. entry points other than package_coverage). An upcoming commit is going to move the get_parse_data call in front of the getParseData call, so update split_on_line_directives to detect the "no directives" case. Without this guard, the mapply call in split_on_line_directives would error under an R version before 4.2; with R 4.2 or later, split_on_line_directives returns empty. * split_on_line_directives: fix handling of single-file package case split_on_line_directives breaks the input at #line directives and returns a named list of lines for each file. For a package with a single file under R/, there is one directive. The bounds calculation is still correct for that case. However, the return value is incorrectly a matrix rather than a list because the mapply call simplifies the result. At this point, this bug is mostly [*] unexposed because this code path is only triggered if utils::getParseData returns NULL, and it should always return a non-NULL result for the single-file package case. The next commit will reorder things, exposing the bug. Tell mapply to not simplify the result. [*] The simplification to a matrix could also happen for multi-file packages in the unlikely event that all files have the same number of lines. * parse_data: promote custom parse logic for R 4.4 compatibility utils::getParseData has a longstanding bug: for an installed package, parse data is available only for the last file [1]. To work around that, the get_tokens helper first calls getParseData and then falls back to custom logic that extracts the concatenated source lines, splits them on #line directives, and calls getParseData on each file's lines. The getParseData bug was fixed in R 4.4.0 (r84538). Unfortunately that change causes at least two issues (for some subset of packages): a substantial performance regression [2] and an error when applying exclusions [3]. Under R 4.4, getParseData always returns non-NULL as a result of that change when calculating package coverage (in other words, the get_parse_data fallback is _not_ triggered). The slowdown is partially due to the parse data no longer being cached across get_tokens calls. Another relevant aspect, for both the slowdown and the error applying exclusions, is likely that the new getParseData returns data for the entire package rather than the per-file parse data the downstream covr code expects. One solution would be to adapt covr's caching and handling of the getParseData when running under R 4.4.0 or later. Instead go with a simpler and more minimal fix. Reorder the calls so that the get_parse_data call, which we know has been the primary code path for package coverage before R 4.4.0, is the first call tried. Leave getParseData as the fallback to handle the non-package coverage cases. [1] https://github.com/r-lib/covr/pull/154 https://bugs.r-project.org/show_bug.cgi?id=16756 [2] As an extreme case, calling package_coverage on R.utils goes from under 15 minutes to over 6 hours. [3] nanotime (v0.3.10) and diffobj (v0.3.5) are two examples of packages that hit into this error. Closes #576 Closes #579 Re: #567 --- NEWS.md | 3 +++ R/parse_data.R | 20 +++++++++++--- R/utils.R | 11 +++++++- tests/testthat/test-utils.R | 52 +++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index f7bfb25f..491f452f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # covr (development version) +* Fixed a performance regression and an error triggered by a change in R + 4.4.0. (@kyleam, #588) + * Fixed an issue where attempting to generate code coverage on an already-loaded package could fail on Windows. (@kevinushey, #574) diff --git a/R/parse_data.R b/R/parse_data.R index 612200e2..68d30c58 100644 --- a/R/parse_data.R +++ b/R/parse_data.R @@ -122,7 +122,12 @@ package_parse_data <- new.env() get_parse_data <- function(srcfile) { if (length(package_parse_data) == 0) { lines <- getSrcLines(srcfile, 1L, Inf) - res <- lapply(split_on_line_directives(lines), + lines_split <- split_on_line_directives(lines) + if (!length(lines_split)) { + return(NULL) + } + + res <- lapply(lines_split, function(x) getParseData(parse(text = x, keep.source = TRUE), includeText = TRUE)) for (i in seq_along(res)) { package_parse_data[[names(res)[[i]]]] <- res[[i]] @@ -135,7 +140,16 @@ clean_parse_data <- function() { rm(list = ls(package_parse_data), envir = package_parse_data) } -# Needed to work around https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=16756 get_tokens <- function(srcref) { - getParseData(srcref) %||% get_parse_data(attr(getSrcref(srcref), "srcfile")) + # Before R 4.4.0, covr's custom get_parse_data is necessary because + # utils::getParseData returns parse data for only the last file in the + # package. That issue (bug#16756) is fixed in R 4.4.0 (r84538). + # + # On R 4.4.0, continue to use get_parse_data because covr's code expects the + # result to be limited to the srcref file. getParseData will return parse data + # for all of the package's files. + get_parse_data(attr(getSrcref(srcref), "srcfile")) %||% + # This covers the non-installed file case where the source file isn't a + # concatenated file with "line N" directives. + getParseData(srcref) } diff --git a/R/utils.R b/R/utils.R index ebb151b2..6d01c23e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -90,9 +90,18 @@ split_on_line_directives <- function(lines) { capture(name = "line_number", digit), spaces, quotes, capture(name = "filename", anything), quotes)) directive_lines <- which(!is.na(matches$line_number)) + if (!length(directive_lines)) { + return(NULL) + } + file_starts <- directive_lines + 1 file_ends <- c(directive_lines[-1] - 1, length(lines)) - res <- mapply(function(start, end) lines[start:end], file_starts, file_ends) + res <- mapply( + function(start, end) lines[start:end], + file_starts, + file_ends, + SIMPLIFY = FALSE + ) names(res) <- na.omit(matches$filename) res } diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index f2dc3df5..90dc0a37 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -51,3 +51,55 @@ test_that("per_line removes blank lines and lines with only punctuation (#387)", expect_equal(line_cov[[1]]$coverage, c(NA, 0, 0, 2, NA, 1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA)) }) + +context("split_on_line_directives") + +test_that("split_on_line_directives returns NULL for input without directive (#588)", { + expect_identical( + split_on_line_directives(NULL), + NULL + ) + expect_identical( + split_on_line_directives(character()), + NULL + ) + expect_identical( + split_on_line_directives("aa"), + NULL + ) + expect_identical( + split_on_line_directives(c("abc", "def")), + NULL + ) +}) + +test_that("split_on_line_directives does not simplify the result (#588)", { + expect_identical( + split_on_line_directives( + c( + '#line 1 "foo.R"', + "abc", + "def" + ) + ), + list( + "foo.R" = c("abc", "def") + ) + ) + expect_identical( + split_on_line_directives( + c( + '#line 1 "foo.R"', + "abc", + "def", + '#line 4 "bar.R"', + "ghi", + "jkl" + ) + ), + list( + "foo.R" = c("abc", "def"), + "bar.R" = c("ghi", "jkl") + ) + ) +})