diff --git a/NAMESPACE b/NAMESPACE index 2112878f3..ece31e942 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(funnest) S3method("[", data.table) S3method("[<-", data.table) diff --git a/R/setkey.R b/R/setkey.R index 63c6155f6..6c2e7480f 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -356,8 +356,6 @@ CJ = function(..., sorted = TRUE, unique = FALSE) if (unique) l[[i]] = unique(y) } } - nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0 - if (nrow > .Machine$integer.max) stop(gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max, domain='R-data.table')) l = .Call(Ccj, l) setDT(l) l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since diff --git a/R/wrappers.R b/R/wrappers.R index 5fec33a92..a19eab598 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -11,4 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) +funnest = function(x, cols = which(vapply_1b(x, is.list))) setDT(.Call(Cunnest, x, cols))[] + testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3233c4f94..fe83921ec 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12343,7 +12343,7 @@ unlink(f) test(1882.1, .Machine$integer.max, 2147483647L) # same on all platforms and very unlikely to change in R (which is good) test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291) v = seq_len(1291L) -test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds 
.Machine$integer.max == 2147483647") +test(1882.3, CJ(v, v, v), error="Cross product of elements provided for cross-join would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647") # no re-read for particular file, #2509 if (test_R.utils) test(1883, fread(testDir("SA2-by-DJZ.csv.gz"), verbose=TRUE, header=FALSE)[c(1,2,1381,.N),], @@ -16726,6 +16726,72 @@ DT = data.table( s4class(x=2L, y="yes", z=1))) test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) +# funnest +x = setDT(list(V1=1:2, V2=c(3,4), V3=list(1:3, 1:2), V4=list(1L, 1:3))) +ans = data.table( + V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), + V2 = c(3, 3, 3, 4, 4, 4, 4, 4, 4), + V3 = c(1L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), + V4 = c(1L, 1L, 1L, 1L, 2L, 3L, 1L, 2L, 3L) +) +test(2131.01, funnest(x), ans) +x[ , V1 := letters[V1]] +ans[ , V1 := letters[V1]] +test(2131.02, funnest(x), ans) +x[ , V1 := factor(V1)] +ans[ , V1 := factor(V1)] +test(2131.03, funnest(x), ans) + +x[ , e := expression(1, 2)] +test(2131.04, funnest(x), error='Unsupported column type') +x[ , e := NULL] + +x[ , c('r', 'z') := .(as.raw(0), 0+1i)] +ans[ , c('r', 'z') := .(as.raw(0), 0+1i)] +test(2131.05, funnest(x), ans) +x[ , c('r', 'z') := NULL] +ans[ , c('r', 'z') := NULL] + +x[ , V3 := .(lapply(V3, function(i) letters[i]))] +ans[ , V3 := letters[V3]] +test(2131.06, funnest(x), ans) + +x[ , V3 := .(lapply(V3, factor))] +ans[ , V3 := factor(V3)] +test(2131.07, funnest(x), ans) + +x[1L, V3 := .(list(expression(1)))] +test(2131.08, funnest(x), error="Type 'expression' not supported") + +x[1L, V3 := .(list(1:3))] +ans[1:3, V3 := factor(1:3)] +ans[ , V3 := factor(V3, levels = c('1', '2', '3', 'a', 'b'))] +test(2131.09, funnest(x), ans) + +x[2L, V3 := .(list(c('a', 'b')))] +ans[ , V3 := as.character(V3)] +test(2131.10, funnest(x), ans) + +ans = unique(ans[ , !'V4'])[ , V4 := .(rep(x$V4, 3:2))] +test(2131.11, funnest(x, cols=3L), ans) +test(2131.12, funnest(x, cols=2:3), ans) +test(2131.13, funnest(x, cols='a'), 
error='cols must be an integer vector, got') +test(2131.14, funnest(x, cols=10L), error='cols to unnest must be in [1, ncol(x)=4]') +test(2131.15, funnest(x, cols=1L), x) +test(2131.16, address(funnest(x, cols=1L)) != address(x)) + +x[ , V4 := NULL] +ans[ , V4 := NULL] +ans = unique(ans) +test(2131.17, funnest(x), ans) + +test(2131.18, funnest(1), error='Input to funnest must be a data.table') +x = data.table(a=1) +test(2131.19, funnest(x), x) +test(2131.20, address(funnest(x)) != address(x)) + +x[ , e := expression(2)] +test(2131.21, funnest(x, cols=2L), error='Unsupported type for unnesting') ######################## # Add new tests here # diff --git a/man/unnest.Rd b/man/unnest.Rd new file mode 100644 index 000000000..67998269e --- /dev/null +++ b/man/unnest.Rd @@ -0,0 +1,27 @@ +\name{funnest} +\alias{funnest} +\title{ Unnest/explode list columns } +\description{ +For tables with non-rectangular columns (i.e., \code{list}), \code{funnest} "stretches" the table by creating a row for each list element, while also preserving the structure of rectangular columns. Akin to \code{EXPLODE} in U-SQL or HiveQL/SparkQL or \code{UNNEST} from Presto or BigQuery, and similar to \code{\link{melt}} -- both reshape "long", but \code{funnest} does so for "ragged" tables more naturally. +} +\usage{ + funnest(x, cols = which(vapply_1b(x, is.list))) +} +\arguments{ + \item{x}{ A \code{data.table} } + \item{cols}{ An \code{integer} vector of column indices of which columns to unnest; defaults to all \code{list} columns. Can be useful for unnesting only a subset of a table's \code{list} columns. Note that non-\code{list} columns are skipped; if there are no \code{list} columns provided, a \code{\link{copy}} of the table is returned. } +} +\details{ + By default, when \code{length(cols) > 1L}, a \emph{cartesian unnest} is performed, that is, the cross-product (\emph{a la} \code{\link{CJ}}) of each row's list elements is returned. 
If there are two columns in \code{cols}, \code{A} and \code{B}, the output will thus have \code{sum(lengths(A) * lengths(B))} rows. +} +\value{ +A \code{data.table} +} +\seealso{ + \code{\link{CJ}}, \code{\link{rbindlist}} +} +\examples{ +x = setDT(list(V1 = 1:2, V2 = 3:4, V3 = list(1:3, 1:2), V4 = list(1L, 1:3))) +funnest(x) +} +\keyword{ data } diff --git a/src/cj.c b/src/cj.c index c312c43b6..8702a929f 100644 --- a/src/cj.c +++ b/src/cj.c @@ -3,9 +3,12 @@ SEXP cj(SEXP base_list) { int ncol = LENGTH(base_list); SEXP out = PROTECT(allocVector(VECSXP, ncol)); - int nrow = 1; - // already confirmed to be less than .Machine$integer.max at R level - for (int j=0; j INT_MAX) + error(_("Cross product of elements provided for cross-join would result in %.0f rows which exceeds .Machine$integer.max == %d"), nrow_dbl, INT_MAX); + int nrow = (int) nrow_dbl; int eachrep = 1; for (int j=ncol-1; j>=0; --j) { SEXP source = VECTOR_ELT(base_list, j), target; @@ -20,7 +23,7 @@ SEXP cj(SEXP base_list) { case INTSXP: { const int *restrict sourceP = INTEGER(source); int *restrict targetP = INTEGER(target); - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads()) if (thislen > OMP_MIN_VALUE) // default static schedule so two threads won't write to same cache line in last column // if they did write to same cache line (and will when last column's thislen is small) there's no correctness issue for (int i=0; i OMP_MIN_VALUE) for (int i=1; i OMP_MIN_VALUE) for (int i=0; i OMP_MIN_VALUE) for (int i=1; i OMP_MIN_VALUE) for (int i=0; i OMP_MIN_VALUE) for (int i=1; i p) + error(_("cols to unnest must be in [1, ncol(x)=%d], but cols[%d]=%d"), p, i, j); + switch(TYPEOF(VECTOR_ELT(x, j-1))) { + case RAWSXP : + case LGLSXP : + case INTSXP : + case REALSXP : + case CPLXSXP : + case STRSXP : break; + case VECSXP : { + lcols[lk++] = j-1; // move to 0-based + } break; + default: + error(_("Unsupported type for unnesting: '%s'"), 
type2char(TYPEOF(VECTOR_ELT(x, j-1)))); + } + } + if (lk == 0) + return (duplicate(x)); + + int row_counts[n]; + + /* unnest the specified cols; each row is expanded with cj, + * then the end result is concatenated with rbindlist. in this way, + * we can let cj handle the crossing logic and rbindlist handle such + * things as type coercion + */ + SEXP cj_rowwise = PROTECT(allocVector(VECSXP, n)); + for (int i=0; i -#include // for isdigit SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) {