From f88e69bd91d5bf9b118d8b981f11cc87feb357ef Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 2 Jan 2020 22:06:45 +0800 Subject: [PATCH 01/19] prod_int function (probably not safe enough to use --- R/wrappers.R | 2 ++ src/data.table.h | 1 + src/init.c | 2 ++ src/utils.c | 14 ++++++++++++++ 4 files changed, 19 insertions(+) diff --git a/R/wrappers.R b/R/wrappers.R index 5fec33a92..67f75b381 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -11,4 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) +prodInt = function(x) .Call(Cprodint, x) + testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/src/data.table.h b/src/data.table.h index 3c7176d6b..d424d8c9c 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -230,6 +230,7 @@ bool islocked(SEXP x); SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); +SEXP prod_int(SEXP x); // types.c char *end(char *start); diff --git a/src/init.c b/src/init.c index aed2da3db..3315fed04 100644 --- a/src/init.c +++ b/src/init.c @@ -119,6 +119,7 @@ SEXP lock(); SEXP unlock(); SEXP islockedR(); SEXP allNAR(); +SEXP prod_int(); // .Externals SEXP fastmean(); @@ -211,6 +212,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, +{"Cprodint", (DL_FUNC) &prod_int, -1}, {NULL, NULL, 0} }; diff --git a/src/utils.c b/src/utils.c index 0825c399b..aa23eee0b 100644 --- a/src/utils.c +++ b/src/utils.c @@ -355,3 +355,17 @@ SEXP coerceUtf8IfNeeded(SEXP x) { UNPROTECT(1); return(ans); } + +// base::prod always returns double; return int directly +SEXP prod_int(SEXP x) { + if (TYPEOF(x) != INTSXP) + error("Internal error: invalid input type %s", type2char(TYPEOF(x))); // # nocov + SEXP ans = PROTECT(allocVector(INTSXP, 1)); + int cumprod = 1; + int *xp = INTEGER(x); + for (int i=0; i < LENGTH(x); i++) { + cumprod *= xp[i]; + } + INTEGER(ans)[0] = cumprod; + return ans; +} From ade2a94b4d7edf0306e7a1d5f62887daff7d20a7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 3 Jan 2020 00:49:36 +0800 Subject: [PATCH 02/19] initial working version of unnest --- NAMESPACE | 1 + R/wrappers.R | 2 +- src/data.table.h | 4 +- src/init.c | 4 +- src/unnest.c | 223 +++++++++++++++++++++++++++++++++++++++++++++++ src/utils.c | 14 --- 6 files changed, 230 insertions(+), 18 deletions(-) create mode 100644 src/unnest.c diff --git a/NAMESPACE b/NAMESPACE index 2112878f3..159a3e394 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(unnest) S3method("[", data.table) S3method("[<-", data.table) diff --git a/R/wrappers.R b/R/wrappers.R index 67f75b381..0fe302601 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -11,6 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) -prodInt = function(x) .Call(Cprodint, x) +unnest = function(x) .Call(Cunnest, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/src/data.table.h b/src/data.table.h index d424d8c9c..e1d3c0fc4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -213,6 +213,9 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP // coalesce.c SEXP coalesce(SEXP x, SEXP inplace); +// unnest.c +SEXP unnest(SEXP x); + // utils.c bool isRealReallyInt(SEXP x); SEXP isReallyReal(SEXP x); @@ -230,7 +233,6 @@ bool islocked(SEXP x); SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); -SEXP prod_int(SEXP x); // types.c char *end(char *start); diff --git a/src/init.c b/src/init.c index 3315fed04..47b7fc3dd 100644 --- a/src/init.c +++ b/src/init.c @@ -119,7 +119,7 @@ SEXP lock(); SEXP unlock(); SEXP islockedR(); SEXP allNAR(); -SEXP prod_int(); +SEXP unnest(); // .Externals SEXP fastmean(); @@ -212,7 +212,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, -{"Cprodint", (DL_FUNC) &prod_int, -1}, +{"Cunnest", (DL_FUNC) &unnest, -1}, {NULL, NULL, 0} }; diff --git a/src/unnest.c b/src/unnest.c new file mode 100644 index 000000000..c638ece5e --- /dev/null +++ b/src/unnest.c @@ -0,0 +1,223 @@ +#include "data.table.h" +#include + +SEXP unnest(SEXP x) { + int n = LENGTH(VECTOR_ELT(x, 0)); + int p = LENGTH(x); + int row_counts[n]; + SEXPTYPE col_types[p]; + + bool all_atomic = true; + + for (int j=0; j # rows corresponding to i in output + for (int i=0; i INT_MAX) + error("Implied number of unnested rows from row %d (%.0f) exceeds %d, the maximum currently allowed.", i, this_row, INT_MAX); + row_counts[i] = (int) this_row; + } + + int out_rows=0; + int i=0; + while (i < n) { + if (out_rows > INT_MAX - row_counts[i]) { + Rprintf("out_rows=%d; row_counts[i]=%d\n", out_rows, row_counts[i]); + double out_rows_dbl = (double) out_rows; + for (;i Date: Fri, 3 Jan 2020 00:52:19 +0800 Subject: [PATCH 03/19] make note of incorrect cartesian unnesting --- src/unnest.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/unnest.c b/src/unnest.c index c638ece5e..a4740dab9 100644 --- a/src/unnest.c +++ b/src/unnest.c @@ -195,6 +195,9 @@ SEXP unnest(SEXP x) { for (int i=0; i wrong output for (int repi=0; repi Date: Fri, 3 Jan 2020 12:21:43 +0800 Subject: [PATCH 04/19] initial progress migrating to simpler signature --- R/setkey.R | 2 -- R/wrappers.R | 2 +- src/cj.c | 12 +++++++++--- src/data.table.h | 2 +- src/unnest.c | 23 ++++++++++++++++------- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 63c6155f6..6c2e7480f 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -356,8 +356,6 @@ CJ = function(..., sorted = TRUE, unique = FALSE) if (unique) l[[i]] = unique(y) } } - nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0 - if (nrow > .Machine$integer.max) stop(gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max, domain='R-data.table')) l = .Call(Ccj, l) setDT(l) l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since diff --git a/R/wrappers.R b/R/wrappers.R index 0fe302601..ce749ad90 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -11,6 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) -unnest = function(x) .Call(Cunnest, x) +unnest = function(x, cols = which(vapply_1b(x, is.list))) .Call(Cunnest, x, cols) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/src/cj.c b/src/cj.c index c312c43b6..857140b4b 100644 --- a/src/cj.c +++ b/src/cj.c @@ -3,9 +3,15 @@ SEXP cj(SEXP base_list) { int ncol = LENGTH(base_list); SEXP out = PROTECT(allocVector(VECSXP, ncol)); - int nrow = 1; - // already confirmed to be less than .Machine$integer.max at R level - for (int j=0; j INT_MAX) { + error(_("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d"), nrow_dbl, INT_MAX); + } else { + nrow = (int) nrow_dbl; + } int eachrep = 1; for (int j=ncol-1; j>=0; --j) { SEXP source = VECTOR_ELT(base_list, j), target; diff --git a/src/data.table.h b/src/data.table.h index e1d3c0fc4..c520943e5 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -214,7 +214,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP SEXP coalesce(SEXP x, SEXP inplace); // unnest.c -SEXP unnest(SEXP x); +SEXP unnest(SEXP x, SEXP cols); // utils.c bool isRealReallyInt(SEXP x); diff --git a/src/unnest.c b/src/unnest.c index a4740dab9..2a72a6af5 100644 --- a/src/unnest.c +++ b/src/unnest.c @@ -1,11 +1,25 @@ #include "data.table.h" #include -SEXP unnest(SEXP x) { +SEXP unnest(SEXP x, SEXP cols) { + int k = LENGTH(cols); + // nothing to unnest -- need to go further, just be sure to copy + if (k == 0) + return (duplicate(x)); int n = LENGTH(VECTOR_ELT(x, 0)); int p = LENGTH(x); + + if (TYPEOF(cols) != INTSXP) + error(_("cols must be an integer vector, got %s"), type2char(TYPEOF(cols))); + int *colp = INTEGER(cols); + for (int i=0; i p) + error(_("cols to unnest must be in [1, ncol(x)=%d], but cols[%d]=%d"), p, i, k); + } + int row_counts[n]; - SEXPTYPE col_types[p]; + + bool all_atomic = true; @@ -28,11 +42,6 @@ SEXP unnest(SEXP x) { } } - // no need to go further, just be sure to copy - if (all_atomic) { - return (duplicate(x)); - } - double this_row; SEXP xj; // find the mapping i -> # rows corresponding to i in output From fa93bee14ff947ac353d45e52f884b092ac488d4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 Jan 2020 11:27:52 +0800 Subject: [PATCH 05/19] extract CJ workhorse to power unnesting --- src/cj.c | 161 ++++++++++++++++++------------------ src/myomp.h | 1 + src/unnest.c | 229 ++++++++++++++++----------------------------------- 3 files changed, 153 insertions(+), 238 deletions(-) diff --git a/src/cj.c b/src/cj.c index 857140b4b..ab317bd5c 100644 --- a/src/cj.c +++ b/src/cj.c @@ -1,5 +1,86 @@ #include "data.table.h" +// workhorse for cycling a vector in the correct pattern, +// essentially rep(x, m, each = n) for appropriate m,n at each j +void cycle_vector(SEXP source, SEXP target, int j, int eachrep, int nrow) { + int thislen = LENGTH(source); + int blocklen = thislen*eachrep; + int ncopy = nrow/blocklen; + switch(TYPEOF(source)) { + case LGLSXP: + case INTSXP: { + const int *restrict sourceP = INTEGER(source); + int *restrict targetP = INTEGER(target); + #pragma omp parallel for num_threads(getDTthreads()) if (nrow >= OMP_MIN_VALUE) + // default static schedule so two threads won't write to same cache line in last column + // if they did write to same cache line (and will when last column's thislen is small) there's no correctness issue + for (int i=0; i= OMP_MIN_VALUE) + for (int i=1; i= OMP_MIN_VALUE) + for (int i=0; i= OMP_MIN_VALUE) + for (int i=1; i= OMP_MIN_VALUE) + for (int i=0; i= OMP_MIN_VALUE) + for (int i=1; i - SEXP unnest(SEXP x, SEXP cols) { int k = LENGTH(cols); // nothing to unnest -- need to go further, just be sure to copy @@ -12,217 +9,129 @@ SEXP unnest(SEXP x, SEXP cols) { if (TYPEOF(cols) != INTSXP) error(_("cols must be an integer vector, got %s"), type2char(TYPEOF(cols))); int *colp = INTEGER(cols); - for (int i=0; i p) - error(_("cols to unnest must be in [1, ncol(x)=%d], but cols[%d]=%d"), p, i, k); - } - - int row_counts[n]; - - - bool all_atomic = true; - - for (int j=0; j p) + error(_("cols to unnest must be in [1, ncol(x)=%d], but cols[%d]=%d"), p, i, j); + switch(TYPEOF(VECTOR_ELT(x, j-1))) { case RAWSXP : case LGLSXP : case INTSXP : case REALSXP : case CPLXSXP : - case STRSXP : - col_types[j] = this_col; - break; + case STRSXP : break; + case VECSXP : { + lcols[lk++] = j-1; // move to 0-based + } break; default: - error(_("Unsupported type: %s"), type2char(this_col)); + error(_("Unsupported type: %s"), type2char(TYPEOF(VECTOR_ELT(x, j)))); } } + int row_counts[n]; + int eachrep[n]; + double this_row; - SEXP xj; - // find the mapping i -> # rows corresponding to i in output - for (int i=0; i # rows corresponding to i in output, + // also check feasibility of output + while (i < n) { this_row = 1; - for (int j=0; j INT_MAX) error("Implied number of unnested rows from row %d (%.0f) exceeds %d, the maximum currently allowed.", i, this_row, INT_MAX); - row_counts[i] = (int) this_row; - } - - int out_rows=0; - int i=0; - while (i < n) { - if (out_rows > INT_MAX - row_counts[i]) { - Rprintf("out_rows=%d; row_counts[i]=%d\n", out_rows, row_counts[i]); + if (out_rows > INT_MAX - this_row) { double out_rows_dbl = (double) out_rows; - for (;i=0; j--) { + Rprintf("j=%d\n", j); + SEXP xj = VECTOR_ELT(x, j), ansj; + if (lj >= 0 && j == lcols[lj]) { // vec col: apply unnesting + // TODO: type bumping for mismatch + SET_VECTOR_ELT(ans, j, ansj=allocVector(TYPEOF(VECTOR_ELT(xj, 0)), out_rows)); for (int i=0; i wrong output - for (int repi=0; repi Date: Sat, 4 Jan 2020 11:29:53 +0800 Subject: [PATCH 06/19] move unnest into cj script (similar logic) --- src/cj.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++ src/data.table.h | 4 +- src/unnest.c | 144 ---------------------------------------------- 3 files changed, 146 insertions(+), 147 deletions(-) delete mode 100644 src/unnest.c diff --git a/src/cj.c b/src/cj.c index ab317bd5c..11634d3bc 100644 --- a/src/cj.c +++ b/src/cj.c @@ -105,3 +105,148 @@ SEXP cj(SEXP base_list) { UNPROTECT(1); return out; } + +SEXP unnest(SEXP x, SEXP cols) { + int k = LENGTH(cols); + // nothing to unnest -- need to go further, just be sure to copy + if (k == 0) + return (duplicate(x)); + int n = LENGTH(VECTOR_ELT(x, 0)); + int p = LENGTH(x); + + if (TYPEOF(cols) != INTSXP) + error(_("cols must be an integer vector, got %s"), type2char(TYPEOF(cols))); + int *colp = INTEGER(cols); + + // ignore non-VECSXP elements of cols; use lcols and lk (list-only versions) + int lcols[k], j; + int lk=0; + for (int i=0; i p) + error(_("cols to unnest must be in [1, ncol(x)=%d], but cols[%d]=%d"), p, i, j); + switch(TYPEOF(VECTOR_ELT(x, j-1))) { + case RAWSXP : + case LGLSXP : + case INTSXP : + case REALSXP : + case CPLXSXP : + case STRSXP : break; + case VECSXP : { + lcols[lk++] = j-1; // move to 0-based + } break; + default: + error(_("Unsupported type: %s"), type2char(TYPEOF(VECTOR_ELT(x, j)))); + } + } + + int row_counts[n]; + int eachrep[n]; + + double this_row; + int out_rows=0, i=0; + // find the mapping i -> # rows corresponding to i in output, + // also check feasibility of output + while (i < n) { + this_row = 1; + for (int jj=0; jj INT_MAX) + error("Implied number of unnested rows from row %d (%.0f) exceeds %d, the maximum currently allowed.", i, this_row, INT_MAX); + if (out_rows > INT_MAX - this_row) { + double out_rows_dbl = (double) out_rows; + for (;i=0; j--) { + Rprintf("j=%d\n", j); + SEXP xj = VECTOR_ELT(x, j), ansj; + if (lj >= 0 && j == lcols[lj]) { // vec col: apply unnesting + // TODO: type bumping for mismatch + SET_VECTOR_ELT(ans, j, ansj=allocVector(TYPEOF(VECTOR_ELT(xj, 0)), out_rows)); + for (int i=0; i p) - error(_("cols to unnest must be in [1, ncol(x)=%d], but cols[%d]=%d"), p, i, j); - switch(TYPEOF(VECTOR_ELT(x, j-1))) { - case RAWSXP : - case LGLSXP : - case INTSXP : - case REALSXP : - case CPLXSXP : - case STRSXP : break; - case VECSXP : { - lcols[lk++] = j-1; // move to 0-based - } break; - default: - error(_("Unsupported type: %s"), type2char(TYPEOF(VECTOR_ELT(x, j)))); - } - } - - int row_counts[n]; - int eachrep[n]; - - double this_row; - int out_rows=0, i=0; - // find the mapping i -> # rows corresponding to i in output, - // also check feasibility of output - while (i < n) { - this_row = 1; - for (int jj=0; jj INT_MAX) - error("Implied number of unnested rows from row %d (%.0f) exceeds %d, the maximum currently allowed.", i, this_row, INT_MAX); - if (out_rows > INT_MAX - this_row) { - double out_rows_dbl = (double) out_rows; - for (;i=0; j--) { - Rprintf("j=%d\n", j); - SEXP xj = VECTOR_ELT(x, j), ansj; - if (lj >= 0 && j == lcols[lj]) { // vec col: apply unnesting - // TODO: type bumping for mismatch - SET_VECTOR_ELT(ans, j, ansj=allocVector(TYPEOF(VECTOR_ELT(xj, 0)), out_rows)); - for (int i=0; i Date: Sat, 4 Jan 2020 17:21:20 +0800 Subject: [PATCH 07/19] fix some bugs, but giving up on this approach for now --- src/cj.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cj.c b/src/cj.c index 11634d3bc..3ad397e13 100644 --- a/src/cj.c +++ b/src/cj.c @@ -17,7 +17,8 @@ void cycle_vector(SEXP source, SEXP target, int j, int eachrep, int nrow) { for (int i=0; i= OMP_MIN_VALUE) for (int i=1; i=0; j--) { Rprintf("j=%d\n", j); SEXP xj = VECTOR_ELT(x, j), ansj; + int outi=0; if (lj >= 0 && j == lcols[lj]) { // vec col: apply unnesting // TODO: type bumping for mismatch SET_VECTOR_ELT(ans, j, ansj=allocVector(TYPEOF(VECTOR_ELT(xj, 0)), out_rows)); for (int i=0; i Date: Sat, 4 Jan 2020 18:13:37 +0800 Subject: [PATCH 08/19] use rbindlist() and cj() internally --- src/cj.c | 59 +++++++++++++++++------------------------------- src/data.table.h | 3 +++ src/rbindlist.c | 1 - 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/src/cj.c b/src/cj.c index 3ad397e13..e698d6ae7 100644 --- a/src/cj.c +++ b/src/cj.c @@ -137,51 +137,35 @@ SEXP unnest(SEXP x, SEXP cols) { lcols[lk++] = j-1; // move to 0-based } break; default: - error(_("Unsupported type: %s"), type2char(TYPEOF(VECTOR_ELT(x, j)))); + error(_("Unsupported type for unnesting: %s"), type2char(TYPEOF(VECTOR_ELT(x, j)))); } } + if (lk == 0) + return (duplicate(x)); int row_counts[n]; - int eachrep[n]; - double this_row; - int out_rows=0, i=0; - // find the mapping i -> # rows corresponding to i in output, - // also check feasibility of output - while (i < n) { - this_row = 1; - for (int jj=0; jj INT_MAX) - error("Implied number of unnested rows from row %d (%.0f) exceeds %d, the maximum currently allowed.", i, this_row, INT_MAX); - if (out_rows > INT_MAX - this_row) { - double out_rows_dbl = (double) out_rows; - for (;i=0; j--) { - Rprintf("j=%d\n", j); - SEXP xj = VECTOR_ELT(x, j), ansj; - int outi=0; - if (lj >= 0 && j == lcols[lj]) { // vec col: apply unnesting - // TODO: type bumping for mismatch - SET_VECTOR_ELT(ans, j, ansj=allocVector(TYPEOF(VECTOR_ELT(xj, 0)), out_rows)); - for (int i=0; i -#include // for isdigit SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) { From 89334778fc64e2382219d160d321bb9178389967 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 Jan 2020 18:23:01 +0800 Subject: [PATCH 09/19] no need for splitting out cj logic anymore --- src/cj.c | 161 ++++++++++++++++++++++++++----------------------------- 1 file changed, 77 insertions(+), 84 deletions(-) diff --git a/src/cj.c b/src/cj.c index e698d6ae7..16fc0cff3 100644 --- a/src/cj.c +++ b/src/cj.c @@ -1,87 +1,5 @@ #include "data.table.h" -// workhorse for cycling a vector in the correct pattern, -// essentially rep(x, m, each = n) for appropriate m,n at each j -void cycle_vector(SEXP source, SEXP target, int j, int eachrep, int nrow) { - int thislen = LENGTH(source); - int blocklen = thislen*eachrep; - int ncopy = nrow/blocklen; - switch(TYPEOF(source)) { - case LGLSXP: - case INTSXP: { - const int *restrict sourceP = INTEGER(source); - int *restrict targetP = INTEGER(target); - #pragma omp parallel for num_threads(getDTthreads()) if (nrow >= OMP_MIN_VALUE) - // default static schedule so two threads won't write to same cache line in last column - // if they did write to same cache line (and will when last column's thislen is small) there's no correctness issue - for (int i=0; i= OMP_MIN_VALUE) - for (int i=1; i= OMP_MIN_VALUE) - for (int i=0; i= OMP_MIN_VALUE) - for (int i=1; i= OMP_MIN_VALUE) - for (int i=0; i= OMP_MIN_VALUE) - for (int i=1; i Date: Sat, 4 Jan 2020 18:24:27 +0800 Subject: [PATCH 10/19] reduce diff slightly --- src/cj.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/cj.c b/src/cj.c index 16fc0cff3..09b218e28 100644 --- a/src/cj.c +++ b/src/cj.c @@ -5,13 +5,10 @@ SEXP cj(SEXP base_list) { SEXP out = PROTECT(allocVector(VECSXP, ncol)); // start with double to allow overflow double nrow_dbl=1; - int nrow; for (int j=0; j INT_MAX) { + if (nrow_dbl > INT_MAX) error(_("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d"), nrow_dbl, INT_MAX); - } else { - nrow = (int) nrow_dbl; - } + int nrow = (int) nrow_dbl; int eachrep = 1; for (int j=ncol-1; j>=0; --j) { SEXP source = VECTOR_ELT(base_list, j), target; From b2214fe78cfe94e9301e92771c2b5d402cd3bf82 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 Jan 2020 18:32:04 +0800 Subject: [PATCH 11/19] validate input, add comments --- src/cj.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/cj.c b/src/cj.c index 09b218e28..707838814 100644 --- a/src/cj.c +++ b/src/cj.c @@ -98,6 +98,8 @@ SEXP cj(SEXP base_list) { } SEXP unnest(SEXP x, SEXP cols) { + if (!INHERITS(x, char_datatable)) + error(_("Input to unnest must be a data.table")); int k = LENGTH(cols); // nothing to unnest -- need to go further, just be sure to copy if (k == 0) @@ -135,6 +137,11 @@ SEXP unnest(SEXP x, SEXP cols) { int row_counts[n]; + /* unnest the specified cols; each row is expanded with cj, + * then the end result is concatentated with rbindlist. in this way, + * we can let cj handle the crossing logic and rbindlist handle such + * things as type coercion + */ SEXP cj_rowwise = PROTECT(allocVector(VECSXP, n)); for (int i=0; i Date: Sat, 4 Jan 2020 21:02:32 +0800 Subject: [PATCH 12/19] conditional parallelism --- src/cj.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cj.c b/src/cj.c index 707838814..94031a867 100644 --- a/src/cj.c +++ b/src/cj.c @@ -23,7 +23,7 @@ SEXP cj(SEXP base_list) { case INTSXP: { const int *restrict sourceP = INTEGER(source); int *restrict targetP = INTEGER(target); - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads()) if (thislen > OMP_MIN_VALUE) // default static schedule so two threads won't write to same cache line in last column // if they did write to same cache line (and will when last column's thislen is small) there's no correctness issue for (int i=0; i OMP_MIN_VALUE) for (int i=1; i OMP_MIN_VALUE) for (int i=0; i OMP_MIN_VALUE) for (int i=1; i OMP_MIN_VALUE) for (int i=0; i OMP_MIN_VALUE) for (int i=1; i Date: Sat, 4 Jan 2020 22:11:29 +0800 Subject: [PATCH 13/19] fix rownames issue, add man page --- R/wrappers.R | 2 +- man/unnest.Rd | 31 +++++++++++++++++++++++++++++++ src/cj.c | 1 - 3 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 man/unnest.Rd diff --git a/R/wrappers.R b/R/wrappers.R index ce749ad90..f7533d772 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -11,6 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) -unnest = function(x, cols = which(vapply_1b(x, is.list))) .Call(Cunnest, x, cols) +unnest = function(x, cols = which(vapply_1b(x, is.list))) setDT(.Call(Cunnest, x, cols))[] testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/man/unnest.Rd b/man/unnest.Rd new file mode 100644 index 000000000..a37afdffa --- /dev/null +++ b/man/unnest.Rd @@ -0,0 +1,31 @@ +\name{unnest} +\alias{unnest} +\title{ Unnest/explode list columns } +\description{ +For tables with non-rectangular columns (i.e., \code{list}), \code{unnest} "stretches" the table by creating a row for each list element, while also preserving the structure of rectangular columns. Akin to \code{EXPLODE} in U-SQL or HiveQL/SparkQL or \code{UNNEST} from Presto or BigQuery, and similar to \code{\link{melt}} -- both reshape "long", but \code{unnest} does so for "ragged" tables more naturally. +} +\usage{ + unnest(x, cols = which(vapply_1b(x, is.list))) +} +\arguments{ + \item{x}{ A \code{data.table} } + \item{cols}{ An \code{integer} vector of column indices of which columns to unnest; defaults to all \code{list} columns. Can be useful for unnesting only a subset of a table's \code{list} columns. Note that non-\code{list} columns are skipped; if there are no \code{list} columns provided, a \code{\link{copy}} of the table is returned. +} +\details{ + By default, when \code{length(cols) > 1L}, a \emph{cartesian unnest} is performed, that is, the cross-product (\emph{a la} \code{\link{CJ}}) of each row's list elements is returned. If there are two columns in \code{cols}, \code{A} and \code{B}, the output will thus have \code{sum(lengths(A) * lengths(B))} rows. +} +\value{ +A \code{data.table} +} +\seealso{ + \code{\link{CJ}}, \code{\link{rbindlist}} +} +\examples{ +x = c(11L, NA, 13L, NA, 15L, NA) +y = c(NA, 12L, 5L, NA, NA, NA) +z = c(11L, NA, 1L, 14L, NA, NA) +fcoalesce(x, y, z) +fcoalesce(list(x,y,z)) # same +fcoalesce(x, list(y,z)) # same +} +\keyword{ data } diff --git a/src/cj.c b/src/cj.c index 94031a867..d3a14a104 100644 --- a/src/cj.c +++ b/src/cj.c @@ -156,7 +156,6 @@ SEXP unnest(SEXP x, SEXP cols) { int out_rows = LENGTH(VECTOR_ELT(unnest_lcols, 0)); SEXP ans = PROTECT(allocVector(VECSXP, p)); - copyMostAttrib(x, ans); for (int j=0, lj=0; j Date: Sat, 4 Jan 2020 22:18:13 +0800 Subject: [PATCH 14/19] bad copypasta --- man/unnest.Rd | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/man/unnest.Rd b/man/unnest.Rd index a37afdffa..d570a530a 100644 --- a/man/unnest.Rd +++ b/man/unnest.Rd @@ -21,11 +21,7 @@ A \code{data.table} \code{\link{CJ}}, \code{\link{rbindlist}} } \examples{ -x = c(11L, NA, 13L, NA, 15L, NA) -y = c(NA, 12L, 5L, NA, NA, NA) -z = c(11L, NA, 1L, 14L, NA, NA) -fcoalesce(x, y, z) -fcoalesce(list(x,y,z)) # same -fcoalesce(x, list(y,z)) # same +x = setDT(list(V1 = 1:2, V2 = 3:4, V3 = list(1:3, 1:2), V4 = list(1L, 1:3))) +unnest(x) } \keyword{ data } From f876968832aa155fb87a430d6bc8cf625c9492c7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 Jan 2020 22:20:03 +0800 Subject: [PATCH 15/19] typo in man --- man/unnest.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/unnest.Rd b/man/unnest.Rd index d570a530a..2a3e7c99c 100644 --- a/man/unnest.Rd +++ b/man/unnest.Rd @@ -9,7 +9,7 @@ For tables with non-rectangular columns (i.e., \code{list}), \code{unnest} "stre } \arguments{ \item{x}{ A \code{data.table} } - \item{cols}{ An \code{integer} vector of column indices of which columns to unnest; defaults to all \code{list} columns. Can be useful for unnesting only a subset of a table's \code{list} columns. Note that non-\code{list} columns are skipped; if there are no \code{list} columns provided, a \code{\link{copy}} of the table is returned. + \item{cols}{ An \code{integer} vector of column indices of which columns to unnest; defaults to all \code{list} columns. Can be useful for unnesting only a subset of a table's \code{list} columns. Note that non-\code{list} columns are skipped; if there are no \code{list} columns provided, a \code{\link{copy}} of the table is returned. } } \details{ By default, when \code{length(cols) > 1L}, a \emph{cartesian unnest} is performed, that is, the cross-product (\emph{a la} \code{\link{CJ}}) of each row's list elements is returned. If there are two columns in \code{cols}, \code{A} and \code{B}, the output will thus have \code{sum(lengths(A) * lengths(B))} rows. From 4cfc5e6cbc2c6ae93af516a9e98b76b2ed195718 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 Jan 2020 23:22:14 +0800 Subject: [PATCH 16/19] rename -> funnest, add first batch of tests --- NAMESPACE | 2 +- R/wrappers.R | 2 +- inst/tests/tests.Rraw | 65 ++++++++++++++++++++++++++++++++++++++++++- man/unnest.Rd | 8 +++--- src/cj.c | 16 ++++++++--- 5 files changed, 82 insertions(+), 11 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 159a3e394..ece31e942 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,7 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) -export(unnest) +export(funnest) S3method("[", data.table) S3method("[<-", data.table) diff --git a/R/wrappers.R b/R/wrappers.R index f7533d772..a19eab598 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -11,6 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) -unnest = function(x, cols = which(vapply_1b(x, is.list))) setDT(.Call(Cunnest, x, cols))[] +funnest = function(x, cols = which(vapply_1b(x, is.list))) setDT(.Call(Cunnest, x, cols))[] testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6e1387585..c18f8be93 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12343,7 +12343,7 @@ unlink(f) test(1882.1, .Machine$integer.max, 2147483647L) # same on all platforms and very unlikely to change in R (which is good) test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291) v = seq_len(1291L) -test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647") +test(1882.3, CJ(v, v, v), error="Cross product of elements provided for cross-join would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647") # no re-read for particular file, #2509 if (test_R.utils) test(1883, fread(testDir("SA2-by-DJZ.csv.gz"), verbose=TRUE, header=FALSE)[c(1,2,1381,.N),], @@ -16708,6 +16708,69 @@ A = data.table(c1 = 1, c2 = 'asd', c3 = expression(as.character(Sys.time()))) B = data.table(c1 = 3, c2 = 'qwe', c3 = expression(as.character(Sys.time()+5))) test(2129, rbind(A,B)$c3, expression(as.character(Sys.time()), as.character(Sys.time()+5))) +# funnest +x = setDT(list(V1=1:2, V2=3:4, V3=list(1:3, 1:2), V4=list(1L, 1:3))) +ans = data.table( + V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), + V2 = c(3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), + V3 = c(1L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), + V4 = c(1L, 1L, 1L, 1L, 2L, 3L, 1L, 2L, 3L) +) +test(2130.01, funnest(x), ans) +x[ , V1 := letters[V1]] +ans[ , V1 := letters[V1]] +test(2130.02, funnest(x), ans) +x[ , V1 := factor(V1)] +ans[ , V1 := factor(V1)] +test(2130.03, funnest(x), ans) + +x[ , e := expression(1, 2)] +test(2130.04, funnest(x), error='Unsupported column type') +x[ , e := NULL] + +x[ , c('r', 'z') := .(as.raw(0), 0+1i)] +ans[ , c('r', 'z') := .(as.raw(0), 0+1i)] +test(2130.05, funnest(x), ans) +x[ , c('r', 'z') := NULL] +ans[ , c('r', 'z') := NULL] + +x[ , V3 := .(lapply(V3, function(i) letters[i]))] +ans[ , V3 := letters[V3]] +test(2130.06, funnest(x), ans) + +x[ , V3 := .(lapply(V3, factor))] +ans[ , V3 := factor(V3)] +test(2130.07, funnest(x), ans) + +x[1L, V3 := .(list(expression(1)))] +test(2130.08, funnest(x), error="Type 'expression' not supported") + +x[1L, V3 := .(list(1:3))] +ans[1:3, V3 := factor(1:3)] +ans[ , V3 := factor(V3, levels = c('1', '2', '3', 'a', 'b'))] +test(2130.09, funnest(x), ans) + +x[2L, V3 := .(list(c('a', 'b')))] +ans[ , V3 := as.character(V3)] +test(2130.10, funnest(x), ans) + +ans = unique(ans[ , !'V4'])[ , V4 := .(rep(x$V4, 3:2))] +test(2131.11, funnest(x, cols=3L), ans) +test(2131.12, funnest(x, cols=2:3), ans) +test(2131.13, funnest(x, cols='a'), error='cols must be an integer vector, got') +test(2131.14, funnest(x, cols=10L), error='cols to unnest must be in [1, ncol(x)=4]') +test(2131.15, funnest(x, cols=1L), x) +test(2131.16, address(funnest(x, cols=1L)) != address(x)) + +x[ , V4 := NULL] +ans[ , V4 := NULL] +ans = unique(ans) +test(2130.15, funnest(x), ans) + +test(2130.16, funnest(1), error='Input to funnest must be a data.table') +x = data.table(a=1) +test(2130.17, funnest(x), x) +test(2130.18, address(funnest(x)) != address(x)) ######################## # Add new tests here # diff --git a/man/unnest.Rd b/man/unnest.Rd index 2a3e7c99c..16135de86 100644 --- a/man/unnest.Rd +++ b/man/unnest.Rd @@ -1,11 +1,11 @@ -\name{unnest} -\alias{unnest} +\name{funnest} +\alias{funnest} \title{ Unnest/explode list columns } \description{ -For tables with non-rectangular columns (i.e., \code{list}), \code{unnest} "stretches" the table by creating a row for each list element, while also preserving the structure of rectangular columns. Akin to \code{EXPLODE} in U-SQL or HiveQL/SparkQL or \code{UNNEST} from Presto or BigQuery, and similar to \code{\link{melt}} -- both reshape "long", but \code{unnest} does so for "ragged" tables more naturally. +For tables with non-rectangular columns (i.e., \code{list}), \code{funnest} "stretches" the table by creating a row for each list element, while also preserving the structure of rectangular columns. Akin to \code{EXPLODE} in U-SQL or HiveQL/SparkQL or \code{UNNEST} from Presto or BigQuery, and similar to \code{\link{melt}} -- both reshape "long", but \code{funnest} does so for "ragged" tables more naturally. } \usage{ - unnest(x, cols = which(vapply_1b(x, is.list))) + funnest(x, cols = which(vapply_1b(x, is.list))) } \arguments{ \item{x}{ A \code{data.table} } diff --git a/src/cj.c b/src/cj.c index d3a14a104..f05675482 100644 --- a/src/cj.c +++ b/src/cj.c @@ -7,7 +7,7 @@ SEXP cj(SEXP base_list) { double nrow_dbl=1; for (int j=0; j INT_MAX) - error(_("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d"), nrow_dbl, INT_MAX); + error(_("Cross product of elements provided for cross-join would result in %.0f rows which exceeds .Machine$integer.max == %d"), nrow_dbl, INT_MAX); int nrow = (int) nrow_dbl; int eachrep = 1; for (int j=ncol-1; j>=0; --j) { @@ -89,7 +89,7 @@ SEXP cj(SEXP base_list) { } } break; default: - error(_("Type '%s' not supported by CJ."), type2char(TYPEOF(source))); + error(_("Type '%s' not supported by cross-join."), type2char(TYPEOF(source))); } eachrep *= thislen; } @@ -99,7 +99,7 @@ SEXP cj(SEXP base_list) { SEXP unnest(SEXP x, SEXP cols) { if (!INHERITS(x, char_datatable)) - error(_("Input to unnest must be a data.table")); + error(_("Input to funnest must be a data.table")); int k = LENGTH(cols); // nothing to unnest -- need to go further, just be sure to copy if (k == 0) @@ -213,7 +213,15 @@ SEXP unnest(SEXP x, SEXP cols) { outi += row_counts[i]; } } break; - default: error(_("Unsupported column type %s"), type2char(TYPEOF(xj))); + case VECSXP: { + for (int i=0; i Date: Sat, 4 Jan 2020 23:51:58 +0800 Subject: [PATCH 17/19] tests numbers --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2b445120b..b59227fdd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16783,12 +16783,12 @@ test(2131.16, address(funnest(x, cols=1L)) != address(x)) x[ , V4 := NULL] ans[ , V4 := NULL] ans = unique(ans) -test(2131.15, funnest(x), ans) +test(2131.17, funnest(x), ans) -test(2131.16, funnest(1), error='Input to funnest must be a data.table') +test(2131.18, funnest(1), error='Input to funnest must be a data.table') x = data.table(a=1) -test(2131.17, funnest(x), x) -test(2131.18, address(funnest(x)) != address(x)) +test(2131.19, funnest(x), x) +test(2131.20, address(funnest(x)) != address(x)) ######################## # Add new tests here # From fbbccd2417256576d250f0cd9fc579e475d818e5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 5 Jan 2020 00:28:22 +0800 Subject: [PATCH 18/19] missed rename in man --- man/unnest.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/unnest.Rd b/man/unnest.Rd index 16135de86..67998269e 100644 --- a/man/unnest.Rd +++ b/man/unnest.Rd @@ -22,6 +22,6 @@ A \code{data.table} } \examples{ x = setDT(list(V1 = 1:2, V2 = 3:4, V3 = list(1:3, 1:2), V4 = list(1L, 1:3))) -unnest(x) +funnest(x) } \keyword{ data } From d161dfd122ed037ffae37b20493e6df453ed9941 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 5 Jan 2020 00:49:14 +0800 Subject: [PATCH 19/19] coverage; fix test --- inst/tests/tests.Rraw | 7 +++++-- src/cj.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b59227fdd..fe83921ec 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16727,10 +16727,10 @@ DT = data.table( test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) # funnest -x = setDT(list(V1=1:2, V2=3:4, V3=list(1:3, 1:2), V4=list(1L, 1:3))) +x = setDT(list(V1=1:2, V2=c(3,4), V3=list(1:3, 1:2), V4=list(1L, 1:3))) ans = data.table( V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), - V2 = c(3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), + V2 = c(3, 3, 3, 4, 4, 4, 4, 4, 4), V3 = c(1L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), V4 = c(1L, 1L, 1L, 1L, 2L, 3L, 1L, 2L, 3L) ) @@ -16790,6 +16790,9 @@ x = data.table(a=1) test(2131.19, funnest(x), x) test(2131.20, address(funnest(x)) != address(x)) +x[ , e := expression(2)] +test(2131.21, funnest(x, cols=2L), error='Unsupported type for unnesting') + ######################## # Add new tests here # ######################## diff --git a/src/cj.c b/src/cj.c index f05675482..8702a929f 100644 --- a/src/cj.c +++ b/src/cj.c @@ -129,7 +129,7 @@ SEXP unnest(SEXP x, SEXP cols) { lcols[lk++] = j-1; // move to 0-based } break; default: - error(_("Unsupported type for unnesting: %s"), type2char(TYPEOF(VECTOR_ELT(x, j)))); + error(_("Unsupported type for unnesting: '%s'"), type2char(TYPEOF(VECTOR_ELT(x, j-1)))); } } if (lk == 0)