From a8e023004e62cff4c6570f56f9a3475c91d97f8b Mon Sep 17 00:00:00 2001 From: 2005m Date: Fri, 2 Aug 2019 03:23:06 +0100 Subject: [PATCH] fifelse - Issue #3657 (#3678) --- DESCRIPTION | 3 +- NAMESPACE | 1 + NEWS.md | 22 +++++++ R/coalesce.R | 4 -- R/test.data.table.R | 20 +++++-- R/wrappers.R | 9 +++ inst/tests/tests.Rraw | 85 +++++++++++++++++++++++++- man/fifelse.Rd | 88 +++++++++++++++++++++++++++ src/fifelse.c | 136 ++++++++++++++++++++++++++++++++++++++++++ src/init.c | 2 + 10 files changed, 357 insertions(+), 13 deletions(-) delete mode 100644 R/coalesce.R create mode 100644 R/wrappers.R create mode 100644 man/fifelse.Rd create mode 100644 src/fifelse.c diff --git a/DESCRIPTION b/DESCRIPTION index f8addc857..6beda4f73 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,8 @@ Authors@R: c( person("@javrucebo","", role="ctb"), person("@marc-outins","", role="ctb"), person("Roy","Storey", role="ctb"), - person("Manish","Saraswat", role="ctb")) + person("Manish","Saraswat", role="ctb"), + person("Morgan","Jacob", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml diff --git a/NAMESPACE b/NAMESPACE index bea6db788..f73c79def 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(setorder, setorderv) export(setNumericRounding, getNumericRounding) export(chmatch, "%chin%", chorder, chgroup) export(rbindlist) +export(fifelse) export(fread) export(fwrite) export(foverlaps) diff --git a/NEWS.md b/NEWS.md index 411a58985..030207708 100644 --- a/NEWS.md +++ b/NEWS.md @@ -135,6 +135,28 @@ 20. `setkey`, `[key]by=` and `on=` in verbose mode (`options(datatable.verbose=TRUE)`) now detect any columns inheriting from `Date` which are stored as 8 byte double, test if any fractions are present, and if not suggest using a 4 byte integer instead (such as `data.table::IDate`) to save space and time, [#1738](https://github.com/Rdatatable/data.table/issues/1738). In future this could be upgraded to `message` or `warning` depending on feedback. +21. New function `fifelse(test, yes, no)` has been implemented in C by Morgan Jacob, [#3657](https://github.com/Rdatatable/data.table/issues/3657). It is comparable to `base::ifelse`, `dplyr::if_else`, `hutils::if_else`, and (forthcoming) [`vctrs::if_else()`](https://vctrs.r-lib.org/articles/stability.html#ifelse). It returns a vector of the same length as `test` but unlike `base::ifelse` the output type is consistent with those of `yes` and `no`. Please see `?data.table::fifelse` for more details. + + ```R + # default 4 threads on a laptop with 16GB RAM and 8 logical CPU + x = sample(c(TRUE,FALSE), 3e8, replace=TRUE) # 1GB + microbenchmark::microbenchmark( + base::ifelse(x, 7L, 11L), + dplyr::if_else(x, 7L, 11L), + hutils::if_else(x, 7L, 11L), + data.table::fifelse(x, 7L, 11L), + times = 5L, unit="s" + ) + # Unit: seconds + # expr min med max neval + # base::ifelse(x, 7L, 11L) 8.5 8.6 8.8 5 + # dplyr::if_else(x, 7L, 11L) 9.4 9.5 9.7 5 + # hutils::if_else(x, 7L, 11L) 2.6 2.6 2.7 5 + # data.table::fifelse(x, 7L, 11L) 1.5 1.5 1.6 5 # setDTthreads(1) + # data.table::fifelse(x, 7L, 11L) 0.8 0.8 0.9 5 # setDTthreads(2) + # data.table::fifelse(x, 7L, 11L) 0.4 0.4 0.5 5 # setDTthreads(4) + ``` + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/coalesce.R b/R/coalesce.R deleted file mode 100644 index b3f9e133c..000000000 --- a/R/coalesce.R +++ /dev/null @@ -1,4 +0,0 @@ - -coalesce = function(...) .Call(Ccoalesce, list(...), FALSE) -setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) - diff --git a/R/test.data.table.R b/R/test.data.table.R index deca3b720..c82dbe744 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -368,12 +368,20 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,output=NULL,message=NULL) { # For test 617 on r-prerel-solaris-sparc on 7 Mar 2013 # nocov start if (!fail) { - cat("Test",numStr,"ran without errors but failed check that x equals y:\n") - cat("> x =",deparse(xsub),"\n") - if (is.data.table(x)) compactprint(x) else {cat("First 6 of ", length(x)," (type '", typeof(x), "'): ", sep=""); print(head(x))} - cat("> y =",deparse(ysub),"\n") - if (is.data.table(y)) compactprint(y) else {cat("First 6 of ", length(y)," (type '", typeof(y), "'): ", sep=""); print(head(y))} - if (!isTRUE(all.equal.result)) cat(all.equal.result,sep="\n") + cat("Test", numStr, "ran without errors but failed check that x equals y:\n") + failPrint = function(x, xsub) { + cat(">", substitute(x), "=", xsub, "\n") + if (is.data.table(x)) compactprint(x) else { + nn = length(x) + cat(sprintf("First %d of %d (type '%s'): \n", min(nn, 6L), length(x), typeof(x))) + # head.matrix doesn't restrict columns + if (length(d <- dim(x))) do.call(`[`, c(list(x, drop = FALSE), lapply(pmin(d, 6L), seq_len))) + else print(head(x)) + } + } + failPrint(x, deparse(xsub)) + failPrint(y, deparse(ysub)) + if (!isTRUE(all.equal.result)) cat(all.equal.result, sep="\n") fail = TRUE } # nocov end diff --git a/R/wrappers.R b/R/wrappers.R new file mode 100644 index 000000000..7beac4ce7 --- /dev/null +++ b/R/wrappers.R @@ -0,0 +1,9 @@ + +# Very small (e.g. one line) R functions that just call C. +# One file wrappers.R to avoid creating lots of small .R files. + +coalesce = function(...) .Call(Ccoalesce, list(...), FALSE) +setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) + +fifelse = function(test, yes, no) .Call(CfifelseR,test, yes, no) + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dbfb740e6..ddecbf312 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15424,9 +15424,90 @@ test(2071.07, any_na(data.table(as.raw(0L))), FALSE) test(2071.08, any_na(data.table(c(1+1i, NA)))) test(2071.09, any_na(data.table(expression(1))), error="Unsupported column type 'expression'") test(2071.10, dcast(data.table(a=1, b=1, l=list(list(1))), a ~ b, value.var='l'), - data.table(a=1, `1`=list(list(1)), key='a')) + data.table(a=1, `1`=list(list(1)), key='a')) test(2071.11, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), - data.table(a=1, `2`=3, key='a')) + data.table(a=1, `2`=3, key='a')) + +# fifelse, #3657 +test_vec = -5L:5L < 0L +test_vec_na = c(test_vec, NA) +out_vec = rep(1:0, 5:6) +out_vec_na = c(out_vec, NA_integer_) +test(2072.001, fifelse(test_vec, 1L, 0L), out_vec) +test(2072.002, fifelse(test_vec, 1, 0), as.numeric(out_vec)) +test(2072.003, fifelse(test_vec, TRUE, FALSE), as.logical(out_vec)) +test(2072.004, fifelse(test_vec, "1", "0"), as.character(out_vec)) +test(2072.005, fifelse(test_vec_na, TRUE, NA), c(rep(TRUE,5L), rep(NA,7L))) +test(2072.006, fifelse(test_vec, rep(1L,11L), rep(0L,11L)), out_vec) +test(2072.007, fifelse(test_vec, rep(1L,11L), 0L), out_vec) +test(2072.008, fifelse(test_vec, 1L, rep(0L,11L)), out_vec) +test(2072.009, fifelse(test_vec, rep(1L,11L), rep(0L,10L)), error="Length of 'no' is 10 but must be 1 or length of 'test' (11).") +test(2072.010, fifelse(test_vec, rep(1,10L), rep(0,11L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).") +test(2072.011, fifelse(test_vec, rep(TRUE,10L), rep(FALSE,10L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).") +test(2072.012, fifelse(0:1, rep(TRUE,2L), rep(FALSE,2L)), error="Argument 'test' must be logical.") +test(2072.013, fifelse(test_vec, TRUE, "FALSE"), error="'yes' is of type logical but 'no' is of type character. Please") +test(2072.014, fifelse(test_vec, list(1),list(2,4)), error="Length of 'no' is 2 but must be 1 or length of 'test' (11).") +test(2072.015, fifelse(test_vec, list(1,3),list(2,4)), error="Length of 'yes' is 2 but must be 1 or length of 'test' (11).") +test(2072.016, fifelse(test_vec, list(1), 0), as.list(as.numeric(out_vec))) +test(2072.017, fifelse(test_vec, 1, list(0)), as.list(as.numeric(out_vec))) +## Jan 1 - 5, 2011 +date_vec = as.Date(14975:14979, origin = '1970-01-01') +test(2072.018, fifelse(date_vec == "2011-01-01", date_vec - 1L, date_vec), + c(date_vec[1L] - 1L, date_vec[2:5])) +test(2072.019, fifelse(c(TRUE,FALSE,TRUE,TRUE,FALSE), factor(letters[1:5]), factor("a", levels=letters[1:5])), + factor(c("a","a","c","d","a"), levels=letters[1:5])) +test(2072.020, fifelse(test_vec_na, 1L, 0L), out_vec_na) +test(2072.021, fifelse(test_vec_na, rep(1L,12L), 0L), out_vec_na) +test(2072.022, fifelse(test_vec_na, rep(1L,12L), rep(0L,12L)), out_vec_na) +test(2072.023, fifelse(test_vec_na, 1L, rep(0L,12L)), out_vec_na) +test(2072.024, fifelse(test_vec_na, 1, 0), as.numeric(out_vec_na)) +test(2072.025, fifelse(test_vec_na, rep(1,12L), 0), as.numeric(out_vec_na)) +test(2072.026, fifelse(test_vec_na, rep(1,12L), rep(0,12L)), as.numeric(out_vec_na)) +test(2072.027, fifelse(test_vec_na, 1, rep(0,12L)), as.numeric(out_vec_na)) +test(2072.028, fifelse(test_vec_na, TRUE, rep(FALSE,12L)), as.logical(out_vec_na)) +test(2072.029, fifelse(test_vec_na, rep(TRUE,12L), FALSE), as.logical(out_vec_na)) +test(2072.030, fifelse(test_vec_na, rep(TRUE,12L), rep(FALSE,12L)), as.logical(out_vec_na)) +test(2072.031, fifelse(test_vec_na, "1", rep("0",12L)), as.character(out_vec_na)) +test(2072.032, fifelse(test_vec_na, rep("1",12L), "0"), as.character(out_vec_na)) +test(2072.033, fifelse(test_vec_na, rep("1",12L), rep("0",12L)), as.character(out_vec_na)) +test(2072.034, fifelse(test_vec_na, "1", "0"), as.character(out_vec_na)) +test(2072.035, fifelse(test_vec, as.Date("2011-01-01"), FALSE), error="'yes' is of type double but 'no' is of type logical. Please") +test(2072.036, fifelse(test_vec_na, 1+0i, 0+0i), as.complex(out_vec_na)) +test(2072.037, fifelse(test_vec_na, rep(1+0i,12L), 0+0i), as.complex(out_vec_na)) +test(2072.038, fifelse(test_vec_na, rep(1+0i,12L), rep(0+0i,12L)), as.complex(out_vec_na)) +test(2072.039, fifelse(test_vec_na, 1+0i, rep(0+0i,12L)), as.complex(out_vec_na)) +test(2072.040, fifelse(test_vec, as.raw(0), as.raw(1)), error="Type raw is not supported.") +test(2072.041, fifelse(TRUE,1,as.Date("2019-07-07")), error="'yes' has different class than 'no'. Please") +test(2072.042, fifelse(TRUE,1L,factor(letters[1])), error="'yes' has different class than 'no'. Please") +test(2072.043, fifelse(TRUE, list(1:5), list(5:1)), list(1:5)) +test(2072.044, fifelse(as.logical(NA), list(1:5), list(5:1)), list(NULL)) +test(2072.045, fifelse(FALSE, list(1:5), list(5:1)), list(5:1)) +test(2072.046, fifelse(TRUE, data.table(1:5), data.table(5:1)), data.table(1:5)) +test(2072.047, fifelse(FALSE, data.table(1:5), data.table(5:1)), data.table(5:1)) +test(2072.048, fifelse(TRUE, data.frame(1:5), data.frame(5:1)), data.frame(1:5)) +test(2072.049, fifelse(FALSE, data.frame(1:5), data.frame(5:1)), data.frame(5:1)) +test(2072.050, fifelse(c(TRUE,FALSE), list(1:5,6:10), list(10:6,5:1)), list(1:5,5:1)) +test(2072.051, fifelse(c(NA,TRUE), list(1:5,6:10), list(10:6,5:1)), list(NULL,6:10)) +test(2072.052, fifelse(c(FALSE,TRUE), list(1:5,6:10), list(10:6,5:1)), list(10:6,6:10)) +test(2072.053, fifelse(c(NA,TRUE), list(1:5), list(10:6,5:1)), list(NULL,1:5)) +test(2072.054, fifelse(c(NA,TRUE), list(1:5,6:10), list(5:1)), list(NULL,6:10)) +test(2072.055, fifelse(c(FALSE,TRUE), list(TRUE), list(10:6,5:1)), list(10:6,TRUE)) +test(2072.056, fifelse(c(FALSE,TRUE), list(as.Date("2019-07-07")), list(10:6,5:1)), list(10:6,as.Date("2019-07-07"))) +test(2072.057, fifelse(c(FALSE,TRUE), list(factor(letters[1:5])), list(10:6,5:1)), list(10:6,factor(letters[1:5]))) +test(2072.058, fifelse(c(NA,FALSE), list(1:5), list(10:6,5:1)), list(NULL,5:1)) +test(2072.059, fifelse(c(NA,FALSE), list(1:5,6:10), list(5:1)), list(NULL,5:1)) +test(2072.060, fifelse(c(NA,FALSE), list(1:5), list(5:1)), list(NULL,5:1)) +test(2072.061, fifelse(c(TRUE,FALSE), list(1L), 0L), list(1L,0L)) +test(2072.062, fifelse(c(TRUE,FALSE), 1L, list(0L)), list(1L,0L)) +test(2072.063, fifelse(c(TRUE,FALSE), factor(c("a","b")), factor(c("a","c"))), error="'yes' and 'no' are both type factor but their levels are different") +test(2072.064, fifelse(c(TRUE, TRUE, TRUE, FALSE, FALSE), factor(NA, levels=letters[1:5]), factor(letters[1:5])), + factor(c(NA,NA,NA,"d","e"),levels=letters[1:5])) +test(2072.065, fifelse(c(TRUE, TRUE, TRUE, FALSE, NA, FALSE), factor(NA, levels=letters[1:6]), factor(letters[1:6])), + factor(c(NA,NA,NA,"d",NA,"f"),levels=letters[1:6])) +test(2072.066, fifelse(c(TRUE, TRUE, TRUE, FALSE, NA, FALSE), factor(letters[1:6]), factor(NA, levels=letters[1:6])), + factor(c("a","b","c",NA,NA,NA), levels=letters[1:6])) +test(2072.067, fifelse(c(TRUE, NA, TRUE, FALSE, FALSE, FALSE), factor(NA), factor(NA)), + factor(c(NA,NA,NA,NA,NA,NA))) ################################### # Add new tests above this line # diff --git a/man/fifelse.Rd b/man/fifelse.Rd new file mode 100644 index 000000000..1e0b6544e --- /dev/null +++ b/man/fifelse.Rd @@ -0,0 +1,88 @@ +\name{fifelse} +\alias{fifelse} +\title{ Fast ifelse } +\description{ +\code{data.table::fifelse} is comparable to \code{base::ifelse}, \code{dplyr::if_else} and \code{hutils::if_else}. +Like these functions, it returns a value with the same length as \code{test} and filled with value from \code{yes} or \code{no}. +This function is mostly written in C for speed and unlike \code{base::ifelse} the output type is consistent with those of \code{yes} and \code{no}. +} +\usage{ + fifelse(test, yes, no) +} +\arguments{ + \item{test}{ A logical vector } + \item{yes, no}{ Values to return depending on \code{TRUE}/\code{FALSE} element of \code{test}. They must be the same type and be either length \code{1} or the same length of \code{test}. Attributes are copied from \code{yes} to the output.} +} +\value{ +A vector of the same length as \code{test} and attributes as \code{yes}. Data values are taken from the values of \code{yes} and \code{no}. +} +\examples{ +# In the following 2 examples, one can see the contrast between ifelse and +# fifelse. Unlike ifelse, fifelse preserves the +# type and class of the inputs. Attributes are are taken from the "yes" argument. + +### Example 1 - Date class preserved +dates = as.Date(c("2011-01-01","2011-01-02","2011-01-03","2011-01-04","2011-01-05")) +ifelse(dates == "2011-01-01", dates - 1, dates) +# [1] 14974 14976 14977 14978 14979 +fifelse(dates == "2011-01-01", dates - 1, dates) +# [1] "2010-12-31" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05" + +### Example 2 - Factor class preserved +v = factor(letters[1:3]) +base::ifelse(c(TRUE,FALSE,TRUE), v, factor("a",levels=levels(v))) +# [1] 1 1 3 + fifelse(c(TRUE,FALSE,TRUE), v, factor("a",levels=levels(v))) +# [1] a a c +# Levels: a b c + +\dontrun{ +# Example 3: +# Unlike dplyr::if_else and hutils::if_else, fifelse and ifelse +# allow singleton replacements to be en-listed ( i.e. wrapped correctly in list()). + +ifelse(c(TRUE, FALSE), 1L, list(0L)) +# [[1]] +# [1] 1 +# +# [[2]] +# [1] 0 + +data.table::fifelse(c(TRUE, FALSE), 1L, list(0L)) +# [[1]] +# [1] 1 +# +# [[2]] +# [1] 0 + +dplyr::if_else(c(TRUE, FALSE), 1L, list(0L)) +# Error: `false` must be an integer vector, not a list +# Call `rlang::last_error()` to see a backtrace + +hutils::if_else(c(TRUE, FALSE), 1L, list(0L)) +# Error in hutils::if_else(c(TRUE, FALSE), 1L, list(0L)) : +# typeof(false) == list, yet typeof(true) == integer. Both true and false must have the same type. + +# The above is useful given that data.table allows list columns as shown below: +DT1 = data.table(a = 0:5, b = list(1, list(2:3), list(4:6), list(6:4), list(3:2), 1)) +DT2 = data.table(a = 0:5, b = list(1, list(2:3), list(4:6), list(6:4), list(3:2), 1)) +all.equal(DT1[ , b := fifelse(a > 2, b, 0)],DT2[ , b := ifelse(a > 2, b, 0)]) +# [1] TRUE + +# Example 4 : +# data.table:fifelse is fast... +# The below code has been run on a desktop with 32GB RAM and 8 logical CPU +x <- -3e8L:3e8 < 0 # 2.4GB +system.time(y1 <- fifelse(x, 1L, 0L)) +system.time(y2 <- hutils::if_else(x, 1L, 0L)) +system.time(y3 <- ifelse(x, 1L, 0L)) +system.time(y4 <- dplyr::if_else(x, 1L, 0L)) +identical(y1,y2) && identical(y1,y3) && identical(y1,y4) +# user system elapsed (seconds) +# 0.55 0.78 1.33 # data.table v1.12.3 +# 2.52 1.44 3.95 # hutils v1.5.0 +# 9.46 5.80 15.26 # base v3.6.0 +# 11.11 9.25 20.38 # dplyr v0.8.3 +} +} +\keyword{ data } diff --git a/src/fifelse.c b/src/fifelse.c new file mode 100644 index 000000000..0ee1fa3af --- /dev/null +++ b/src/fifelse.c @@ -0,0 +1,136 @@ +#include "data.table.h" + +SEXP fifelseR(SEXP l, SEXP a, SEXP b) +{ + // l is the test + // a is what to do in case l is TRUE + // b is what to do in case l is FALSE + + if (!isLogical(l)) error("Argument 'test' must be logical."); + + const int64_t len0 = xlength(l); + const int64_t len1 = xlength(a); + const int64_t len2 = xlength(b); + SEXPTYPE ta = TYPEOF(a); + SEXPTYPE tb = TYPEOF(b); + int nprotect = 0; + + // Check if same storage type and do en-listing of singleton + if (ta != tb) + { + if (ta == VECSXP && (tb == INTSXP || tb == REALSXP || tb == LGLSXP || tb == CPLXSXP || tb == STRSXP)) + { + if (len2 == 1) + { + SEXP tmp = PROTECT(allocVector(VECSXP,1)); nprotect++; + SET_VECTOR_ELT(tmp, 0, b); + b = tmp; + tb = VECSXP; + } + } else if (tb == VECSXP && (ta == INTSXP || ta == REALSXP || ta == LGLSXP || ta == CPLXSXP || ta == STRSXP)){ + if (len1 == 1) + { + SEXP tmp = PROTECT(allocVector(VECSXP,1)); nprotect++; + SET_VECTOR_ELT(tmp, 0, a); + a = tmp; + ta = VECSXP; + } + } else { + error("'yes' is of type %s but 'no' is of type %s. Please make sure candidate replacements are of the same type.", type2char(ta), type2char(tb)); + } + } + + if (!R_compute_identical(PROTECT(getAttrib(a,R_ClassSymbol)), PROTECT(getAttrib(b,R_ClassSymbol)), 0)) + error("'yes' has different class than 'no'. Please make sure that candidate replacements have the same class."); + UNPROTECT(2); + + if (isFactor(a)) { + if (!R_compute_identical(PROTECT(getAttrib(a,R_LevelsSymbol)), PROTECT(getAttrib(b,R_LevelsSymbol)), 0)) + error("'yes' and 'no' are both type factor but their levels are different."); + UNPROTECT(2); + } + + // Check here the length of the different input variables. + if (len1!=1 && len1!=len0) error("Length of 'yes' is %lld but must be 1 or length of 'test' (%lld).", len1, len0); + if (len2!=1 && len2!=len0) error("Length of 'no' is %lld but must be 1 or length of 'test' (%lld).", len2, len0); + const int64_t amask = len1>1 ? INT64_MAX : 0; + const int64_t bmask = len2>1 ? INT64_MAX : 0; + + const int *restrict pl = LOGICAL(l); + SEXP ans = PROTECT(allocVector(TYPEOF(a), len0)); nprotect++; + copyMostAttrib(a, ans); + + switch(TYPEOF(a)) { + case LGLSXP: { + int *restrict pans = LOGICAL(ans); + const int *restrict pa = LOGICAL(a); + const int *restrict pb = LOGICAL(b); + #pragma omp parallel for num_threads(getDTthreads()) + for (int64_t i=0; i