From 56740088059f88f13293bbd94c756be928a193c8 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Mon, 13 Nov 2023 22:29:02 -0800
Subject: [PATCH] New sample_int_linter (#2274)

* New sample_int_linter

* add metadata regression test
---
 DESCRIPTION                             |  1 +
 NAMESPACE                               |  1 +
 NEWS.md                                 |  1 +
 R/sample_int_linter.R                   | 60 ++++++++++++++++
 inst/lintr/linters.csv                  |  1 +
 man/efficiency_linters.Rd               |  1 +
 man/linters.Rd                          |  7 +-
 man/readability_linters.Rd              |  1 +
 man/robustness_linters.Rd               |  1 +
 man/sample_int_linter.Rd                | 18 +++++
 tests/testthat/test-sample_int_linter.R | 95 +++++++++++++++++++++++++
 11 files changed, 184 insertions(+), 3 deletions(-)
 create mode 100644 R/sample_int_linter.R
 create mode 100644 man/sample_int_linter.Rd
 create mode 100644 tests/testthat/test-sample_int_linter.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 1f7264286..31871bd90 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -156,6 +156,7 @@ Collate:
     'regex_subset_linter.R'
     'repeat_linter.R'
     'routine_registration_linter.R'
+    'sample_int_linter.R'
     'scalar_in_linter.R'
     'semicolon_linter.R'
     'seq_linter.R'
diff --git a/NAMESPACE b/NAMESPACE
index 9da6d5205..a6c17992e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -119,6 +119,7 @@ export(redundant_ifelse_linter)
 export(regex_subset_linter)
 export(repeat_linter)
 export(routine_registration_linter)
+export(sample_int_linter)
 export(sarif_output)
 export(scalar_in_linter)
 export(semicolon_linter)
diff --git a/NEWS.md b/NEWS.md
index 4f53beb0e..fb391d3fd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,6 +6,7 @@
 
 ### New linters
 
+* `sample_int_linter()` for encouraging `sample.int(n, ...)` over equivalents like `sample(1:n, ...)` (part of #884, @MichaelChirico).
 * `stopifnot_all_linter()` discourages tests with `all()` like `stopifnot(all(x > 0))`; `stopifnot()` runs `all()` itself, and uses a better error message (part of #884, @MichaelChirico).
 * `comparison_negation_linter()` for discouraging negated comparisons when a direct negation is preferable, e.g. `!(x == y)` could be `x != y` (part of #884, @MichaelChirico).
 * `terminal_close_linter()` for discouraging using `close()` to end functions (part of #884, @MichaelChirico). Such usages are not robust to errors, where `close()` will not be run as intended. Put `close()` in an `on.exit()` hook, or use {withr} to manage connections with proper cleanup.
diff --git a/R/sample_int_linter.R b/R/sample_int_linter.R
new file mode 100644
index 000000000..a9da00f6f
--- /dev/null
+++ b/R/sample_int_linter.R
@@ -0,0 +1,60 @@
+#' Require usage of sample.int(n, m, ...) over sample(1:n, m, ...)
+#'
+#' [sample.int()] is preferable to `sample()` for the case of sampling numbers
+#'   between 1 and `n`. `sample` calls `sample.int()` "under the hood".
+#'
+#' @evalRd rd_tags("sample_int_linter")
+#' @seealso [linters] for a complete list of linters available in lintr.
+#' @export
+sample_int_linter <- function() {
+  # looking for anything like sample(1: that doesn't come after a $ extraction
+  # exclude TRUE/FALSE for sample(replace = TRUE, ...) usage. better
+  #   would be match.arg() but this also works.
+  xpath <- glue("
+  //SYMBOL_FUNCTION_CALL[text() = 'sample']
+    /parent::expr[not(OP-DOLLAR or OP-AT)]
+    /following-sibling::expr[1][
+      (
+        expr[1]/NUM_CONST[text() = '1' or text() = '1L']
+        and OP-COLON
+      )
+      or expr/SYMBOL_FUNCTION_CALL[text() = 'seq_len']
+      or (
+        expr/SYMBOL_FUNCTION_CALL[text() = 'seq']
+        and (
+          count(expr) = 2
+          or (
+            expr[2]/NUM_CONST[text() = '1' or text() = '1L']
+            and not(SYMBOL_SUB[
+              text() = 'by'
+              and not(following-sibling::expr[1]/NUM_CONST[text() = '1' or text() = '1L'])
+            ])
+          )
+        )
+      )
+      or NUM_CONST[not(text() = 'TRUE' or text() = 'FALSE')]
+    ]
+    /parent::expr
+  ")
+
+  Linter(function(source_expression) {
+    if (!is_lint_level(source_expression, "expression")) {
+      return(list())
+    }
+
+    xml <- source_expression$xml_parsed_content
+
+    bad_expr <- xml_find_all(xml, xpath)
+    first_call <- xp_call_name(bad_expr, depth = 2L)
+    original <- sprintf("%s(n)", first_call)
+    original[!is.na(xml_find_first(bad_expr, "expr[2]/OP-COLON"))] <- "1:n"
+    original[!is.na(xml_find_first(bad_expr, "expr[2]/NUM_CONST"))] <- "n"
+
+    xml_nodes_to_lints(
+      bad_expr,
+      source_expression = source_expression,
+      lint_message = glue("sample.int(n, m, ...) is preferable to sample({original}, m, ...)."),
+      type = "warning"
+    )
+  })
+}
diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv
index 275c1863d..c6ca4ca94 100644
--- a/inst/lintr/linters.csv
+++ b/inst/lintr/linters.csv
@@ -76,6 +76,7 @@ redundant_ifelse_linter,best_practices efficiency consistency configurable
 regex_subset_linter,best_practices efficiency
 repeat_linter,style readability
 routine_registration_linter,best_practices efficiency robustness
+sample_int_linter,efficiency readability robustness
 scalar_in_linter,readability consistency best_practices efficiency
 semicolon_linter,style readability default configurable
 semicolon_terminator_linter,style readability deprecated configurable
diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd
index 7122f90e8..146c0be54 100644
--- a/man/efficiency_linters.Rd
+++ b/man/efficiency_linters.Rd
@@ -28,6 +28,7 @@ The following linters are tagged with 'efficiency':
 \item{\code{\link{redundant_ifelse_linter}}}
 \item{\code{\link{regex_subset_linter}}}
 \item{\code{\link{routine_registration_linter}}}
+\item{\code{\link{sample_int_linter}}}
 \item{\code{\link{scalar_in_linter}}}
 \item{\code{\link{seq_linter}}}
 \item{\code{\link{sort_linter}}}
diff --git a/man/linters.Rd b/man/linters.Rd
index c960b5d00..ef71af792 100644
--- a/man/linters.Rd
+++ b/man/linters.Rd
@@ -24,12 +24,12 @@ The following tags exist:
 \item{\link[=correctness_linters]{correctness} (7 linters)}
 \item{\link[=default_linters]{default} (25 linters)}
 \item{\link[=deprecated_linters]{deprecated} (8 linters)}
-\item{\link[=efficiency_linters]{efficiency} (25 linters)}
+\item{\link[=efficiency_linters]{efficiency} (26 linters)}
 \item{\link[=executing_linters]{executing} (5 linters)}
 \item{\link[=package_development_linters]{package_development} (14 linters)}
 \item{\link[=pkg_testthat_linters]{pkg_testthat} (12 linters)}
-\item{\link[=readability_linters]{readability} (56 linters)}
-\item{\link[=robustness_linters]{robustness} (15 linters)}
+\item{\link[=readability_linters]{readability} (57 linters)}
+\item{\link[=robustness_linters]{robustness} (16 linters)}
 \item{\link[=style_linters]{style} (38 linters)}
 }
 }
@@ -108,6 +108,7 @@ The following linters exist:
 \item{\code{\link{regex_subset_linter}} (tags: best_practices, efficiency)}
 \item{\code{\link{repeat_linter}} (tags: readability, style)}
 \item{\code{\link{routine_registration_linter}} (tags: best_practices, efficiency, robustness)}
+\item{\code{\link{sample_int_linter}} (tags: efficiency, readability, robustness)}
 \item{\code{\link{scalar_in_linter}} (tags: best_practices, consistency, efficiency, readability)}
 \item{\code{\link{semicolon_linter}} (tags: configurable, default, readability, style)}
 \item{\code{\link{seq_linter}} (tags: best_practices, consistency, default, efficiency, robustness)}
diff --git a/man/readability_linters.Rd b/man/readability_linters.Rd
index 7bb1a809f..06deb9233 100644
--- a/man/readability_linters.Rd
+++ b/man/readability_linters.Rd
@@ -53,6 +53,7 @@ The following linters are tagged with 'readability':
 \item{\code{\link{quotes_linter}}}
 \item{\code{\link{redundant_equals_linter}}}
 \item{\code{\link{repeat_linter}}}
+\item{\code{\link{sample_int_linter}}}
 \item{\code{\link{scalar_in_linter}}}
 \item{\code{\link{semicolon_linter}}}
 \item{\code{\link{sort_linter}}}
diff --git a/man/robustness_linters.Rd b/man/robustness_linters.Rd
index 499965308..18f12ca61 100644
--- a/man/robustness_linters.Rd
+++ b/man/robustness_linters.Rd
@@ -21,6 +21,7 @@ The following linters are tagged with 'robustness':
 \item{\code{\link{namespace_linter}}}
 \item{\code{\link{nonportable_path_linter}}}
 \item{\code{\link{routine_registration_linter}}}
+\item{\code{\link{sample_int_linter}}}
 \item{\code{\link{seq_linter}}}
 \item{\code{\link{strings_as_factors_linter}}}
 \item{\code{\link{T_and_F_symbol_linter}}}
diff --git a/man/sample_int_linter.Rd b/man/sample_int_linter.Rd
new file mode 100644
index 000000000..ed4c7a7fc
--- /dev/null
+++ b/man/sample_int_linter.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sample_int_linter.R
+\name{sample_int_linter}
+\alias{sample_int_linter}
+\title{Require usage of sample.int(n, m, ...) over sample(1:n, m, ...)}
+\usage{
+sample_int_linter()
+}
+\description{
+\code{\link[=sample.int]{sample.int()}} is preferable to \code{sample()} for the case of sampling numbers
+between 1 and \code{n}. \code{sample} calls \code{sample.int()} "under the hood".
+}
+\seealso{
+\link{linters} for a complete list of linters available in lintr.
+}
+\section{Tags}{
+\link[=efficiency_linters]{efficiency}, \link[=readability_linters]{readability}, \link[=robustness_linters]{robustness}
+}
diff --git a/tests/testthat/test-sample_int_linter.R b/tests/testthat/test-sample_int_linter.R
new file mode 100644
index 000000000..bf217cbee
--- /dev/null
+++ b/tests/testthat/test-sample_int_linter.R
@@ -0,0 +1,95 @@
+test_that("sample_int_linter skips allowed usages", {
+  linter <- sample_int_linter()
+
+  expect_lint("sample(n, m)", NULL, linter)
+  expect_lint("sample(n, m, TRUE)", NULL, linter)
+  expect_lint("sample(n, m, prob = 1:n/n)", NULL, linter)
+  expect_lint("sample(foo(x), m, TRUE)", NULL, linter)
+  expect_lint("sample(n, replace = TRUE)", NULL, linter)
+
+  expect_lint("sample(10:1, m)", NULL, linter)
+})
+
+test_that("sample_int_linter blocks simple disallowed usages", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(1:n, m, ...).")
+
+  expect_lint("sample(1:10, 2)", lint_msg, linter)
+  # also matches literal integer
+  expect_lint("sample(1L:10L, 2)", lint_msg, linter)
+  expect_lint("sample(1:n, 2)", lint_msg, linter)
+  expect_lint("sample(1:k, replace = TRUE)", lint_msg, linter)
+  expect_lint("sample(1:foo(x), prob = bar(x))", lint_msg, linter)
+})
+
+test_that("sample_int_linter blocks sample(seq_len(n), ...) as well", {
+  expect_lint(
+    "sample(seq_len(10), 2)",
+    rex::rex("sample.int(n, m, ...) is preferable to sample(seq_len(n), m, ...)."),
+    sample_int_linter()
+  )
+})
+
+test_that("sample_int_linter blocks sample(seq(n)) and sample(seq(1, ...))", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(seq(n), m, ...).")
+
+  expect_lint("sample(seq(n), 5)", lint_msg, linter)
+  expect_lint("sample(seq(1, 10), 5)", lint_msg, linter)
+  expect_lint("sample(seq(1, 10, by = 1), 5)", lint_msg, linter)
+  expect_lint("sample(seq(1L, 10, by = 1L), 5)", lint_msg, linter)
+
+  # lint doesn't apply when by= is used (except when set to literal 1)
+  expect_lint("sample(seq(1, 10, by = 2), 5)", NULL, linter)
+  expect_lint("sample(seq(1, 10, by = n), 5)", NULL, linter)
+})
+
+test_that("sample_int_linter catches literal integer/numeric in the first arg", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(n, m, ...).")
+
+  expect_lint("sample(10L, 4)", lint_msg, linter)
+  expect_lint("sample(10, 5)", lint_msg, linter)
+})
+
+test_that("sample_int_linter skips TRUE or FALSE in the first argument", {
+  linter <- sample_int_linter()
+
+  expect_lint("sample(replace = TRUE, letters)", NULL, linter)
+  expect_lint("sample(replace = FALSE, letters)", NULL, linter)
+})
+
+test_that("sample_int_linter skips x$sample() usage", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(n, m, ...).")
+
+  expect_lint("foo$sample(1L)", NULL, linter)
+  expect_lint("foo$sample(1:10)", NULL, linter)
+  expect_lint("foo$sample(seq_len(10L))", NULL, linter)
+  # ditto for '@' slot extraction
+  expect_lint("foo@sample(1L)", NULL, linter)
+
+  # however, base::sample qualification is still caught
+  expect_lint("base::sample(10L)", lint_msg, linter)
+
+  # but also, not everything "below" a $ extraction is skipped
+  expect_lint("foo$bar(sample(10L))", lint_msg, linter)
+})
+
+test_that("multiple lints are generated correctly", {
+  expect_lint(
+    trim_some("{
+      sample(1:10, 2)
+      sample(10, 2)
+      sample(seq_len(10), 2)
+      sample(seq(10), 2)
+    }"),
+    list(
+      list(rex::rex("sample(1:n"), line_number = 2L, column_number = 3L),
+      list(rex::rex("sample(n"), line_number = 3L, column_number = 3L),
+      list(rex::rex("sample(seq_len(n)"), line_number = 4L, column_number = 3L),
+      list(rex::rex("sample(seq(n)"), line_number = 5L, column_number = 3L)
+    ),
+    sample_int_linter()
+  )
+})