diff --git a/NAMESPACE b/NAMESPACE index 2497f0cf9..b894d63c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -206,3 +206,6 @@ S3method(format_list_item, data.frame) export(fdroplevels, setdroplevels) S3method(droplevels, data.table) + +# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html +if (getRversion() >= "4.4.0") S3method(sort_by, data.table) diff --git a/NEWS.md b/NEWS.md index fa384dc10..ca3132c56 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,6 +69,8 @@ rowwiseDT( 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`. +7. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR. + ## BUG FIXES 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. 
diff --git a/R/data.table.R b/R/data.table.R index bac200b8a..36b7426c9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2532,6 +2532,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR } } +# data.table method for the sort_by() generic (added in R 4.4.0): returns the rows of x ordered by y +sort_by.data.table = function(x, y, ...) { + if (!cedta()) return(NextMethod()) # nocov + # y may be a formula (resolved against x), a list (including a data.table) or a bare vector + keys = if (inherits(y, "formula")) .formula2varlist(y, x) else y + if (!is.list(keys)) keys = list(keys) + # order with data.table's forder instead of base 'order'; keys are unnamed so they are passed positionally, ahead of any named options in ... + ord_args = c(unname(keys), list(...)) + ord = do.call(forder, ord_args) + x[ord, , drop = FALSE] +} + # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want copy = function(x) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 657478c61..6213b9e39 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20697,3 +20697,30 @@ if (test_bit64) { test(2300.3, DT1[DT2, on='id'], error="Incompatible join types") test(2300.4, DT2[DT1, on='id'], error="Incompatible join types") } + +# sort_by.data.table +DT1 = data.table(a = c(1, 3, 2, NA, 3) , b = 4:0) +DT2 = data.table(a = c("c", "a", "B")) # data.table uses C-locale and should sort_by if cedta() +DT3 = data.table(a = c(1,2,3), b = list(c("a","b","",NA),c(1,3,2,0), c(T,T,F,NA))) # list column + +# sort_by.data.table: basics +test(2301.01, sort_by(DT1, ~ a + b), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) +test(2301.02, sort_by(DT1, ~ I(a + b)), data.table(a = c(3,2,1,3,NA), b = c(0L,2L,4L,3L,1L))) +test(2301.03, sort_by(DT2, ~ a), data.table(a = c("B", "a", "c"))) + +# sort_by.data.table: list columns. +# NOTE 1: .formula2varlist works well with list columns. +# NOTE 2: 4 elem in DT of 3 row because forderv takes a list column as a DT. +test(2301.04, sort_by(DT3, ~b), DT3[order(b)]) # should be consistent. 
+ +# sort_by.data.table: additional C-locale sorting, called from j on .SD (uppercase "B" is expected before lowercase "a") +test(2301.10, DT2[, sort_by(.SD, a)], data.table(a = c("B", "a", "c"))) +test(2301.11, DT2[, sort_by(.SD, ~ a)], data.table(a = c("B", "a", "c"))) + +# sort_by.data.table: various working interfaces for y (plain list, data.table, .() in j, formula, formula wrapping .()); all expect the same row order +test(2301.20, sort_by(DT1, list(DT1$a, DT1$b)), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) +test(2301.21, sort_by(DT1, DT1[, .(a, b)]), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) +test(2301.22, DT1[, sort_by(.SD, .(a, b))], data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) +test(2301.23, DT1[, sort_by(.SD, ~ a + b)], data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) +test(2301.24, DT1[, sort_by(.SD, ~ .(a, b))], data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) + diff --git a/man/setorder.Rd b/man/setorder.Rd index e1cdc40bb..36c6e1259 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -5,6 +5,7 @@ \alias{fastorder} \alias{forder} \alias{forderv} +\alias{sort_by} \title{Fast row reordering of a data.table by reference} \description{ @@ -32,6 +33,7 @@ setorderv(x, cols = colnames(x), order=1L, na.last=FALSE) # optimised to use data.table's internal fast order # x[order(., na.last=TRUE)] # x[order(., decreasing=TRUE)] +# sort_by(x, ., na.last=TRUE, decreasing=FALSE) } \arguments{ \item{x}{ A \code{data.table}. } @@ -46,7 +48,7 @@ when \code{b} is of type \code{character} as well. } \code{order} must be either \code{1} or equal to that of \code{cols}. If \code{length(order) == 1}, it is recycled to \code{length(cols)}. } \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE}, they are placed first; if \code{NA} they are removed. -\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its +\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and related \code{sort_by(x, .)} and its default is \code{TRUE}. \code{setorder} and \code{setorderv} only accept \code{TRUE}/\code{FALSE} with default \code{FALSE}. 
} } @@ -71,8 +73,8 @@ sets the \code{sorted} attribute. \code{na.last} argument, by default, is \code{FALSE} for \code{setorder} and \code{setorderv} to be consistent with \code{data.table}'s \code{setkey} and -is \code{TRUE} for \code{x[order(.)]} to be consistent with \code{base::order}. -Only \code{x[order(.)]} can have \code{na.last = NA} as it is a subset operation +is \code{TRUE} for \code{x[order(.)]} and \code{sort_by(x, .)} to be consistent with \code{base::order}. +Only \code{x[order(.)]} (and the related \code{sort_by(x, .)}) can have \code{na.last = NA} as it is a subset operation as opposed to \code{setorder} or \code{setorderv} which reorders the data.table by reference. @@ -96,6 +98,11 @@ was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE} If \code{setorder} results in reordering of the rows of a keyed \code{data.table}, then its key will be set to \code{NULL}. + +\code{sort_by(x, y, \dots)} is the \code{data.table} method for the S3 generic \code{sort_by}. +It accepts the same formula or list interfaces as the \code{data.frame} method of \code{sort_by} but internally uses \code{data.table}'s fast ordering; +hence it behaves the same as \code{x[order(.)]} and takes the same optional named arguments, with the same defaults. + } \value{ The input is modified by reference, and returned (invisibly) so it can be used