Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce fexplode function #4156

Draft
wants to merge 20 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export(nafill)
export(setnafill)
export(.Last.updated)
export(fcoalesce)
export(unnest)

S3method("[", data.table)
S3method("[<-", data.table)
Expand Down
2 changes: 2 additions & 0 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.l
# Thin wrapper: forwards x, cols and check_dups (FALSE unless supplied) to the
# native routine registered as CcolnamesInt and returns whatever it returns.
colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups)
# Thin wrapper: passes x unchanged to the native routine registered as CcoerceFillR.
coerceFill = function(x) .Call(CcoerceFillR, x)

# R-level entry point for unnest: passes x unchanged to the native routine
# registered as Cunnest; no argument checking is done on the R side.
unnest = function(x) .Call(Cunnest, x)

# Calls the native routine registered as CtestMsgR. Each argument is coerced
# with as.integer() and only its first element is passed on.
testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
3 changes: 3 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,9 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP
// coalesce.c
SEXP coalesce(SEXP x, SEXP inplace);

// unnest.c
SEXP unnest(SEXP x);

// utils.c
bool isRealReallyInt(SEXP x);
SEXP isReallyReal(SEXP x);
Expand Down
2 changes: 2 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ SEXP lock();
SEXP unlock();
SEXP islockedR();
SEXP allNAR();
SEXP unnest();

// .Externals
SEXP fastmean();
Expand Down Expand Up @@ -211,6 +212,7 @@ R_CallMethodDef callMethods[] = {
{"CfrollapplyR", (DL_FUNC) &frollapplyR, -1},
{"CtestMsgR", (DL_FUNC) &testMsgR, -1},
{"C_allNAR", (DL_FUNC) &allNAR, -1},
{"Cunnest", (DL_FUNC) &unnest, -1},
{NULL, NULL, 0}
};

Expand Down
226 changes: 226 additions & 0 deletions src/unnest.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
#include "data.table.h"
#include <Rdefines.h>

// Repeat source row i of an atomic column row_counts[i] times, consecutively.
// STORE writes one value to output position `outi` using source index `i`.
#define UNNEST_REP(STORE)                                        \
  for (int i=0; i<n; i++) {                                      \
    for (int r=0; r<row_counts[i]; r++) { STORE; outi++; }       \
  }

// Expand row i of a list column into its share of the per-row cross join.
// `inner[i]` is the product of the lengths of the list columns already written
// for row i (those to the right of this one); each element is repeated that
// many times in a run, and the whole sequence is recycled `times` times so the
// row fills exactly row_counts[i] output positions. Rows that expand to zero
// output rows are skipped, which also keeps len*each non-zero in the division.
// SRC_DECL may declare a typed pointer into `xji`; STORE writes element `k`
// to output position `outi`.
#define UNNEST_CROSS(SRC_DECL, STORE)                            \
  for (int i=0; i<n; i++) {                                      \
    if (row_counts[i] == 0) continue;                            \
    SEXP xji = VECTOR_ELT(xj, i);                                \
    SRC_DECL;                                                    \
    const int len = LENGTH(xji), each = inner[i];                \
    const int times = row_counts[i] / (len * each);              \
    for (int t=0; t<times; t++)                                  \
      for (int k=0; k<len; k++)                                  \
        for (int e=0; e<each; e++) { STORE; outi++; }            \
    inner[i] *= len;                                             \
  }

/*
 * unnest: expand the list columns of `x` (a list of equal-length columns).
 *
 * For every input row i the output contains prod_j length(x[[j]][[i]]) rows,
 * the product running over the list columns. Atomic columns have their value
 * for row i repeated across those rows; list columns contribute their elements
 * as a cross join in which the leftmost list column varies slowest.
 *
 * x       list whose columns are raw, logical, integer, double, complex,
 *         character or list vectors, all of the same length.
 * returns a new list with the same number of columns; "most" attributes and
 *         the names of `x` are copied. When there is nothing to expand (no
 *         columns, no rows, or no list columns) a duplicate of `x` is returned.
 * errors  on unsupported column types, columns of unequal length, list columns
 *         whose elements differ in type, and when the implied number of rows
 *         (per input row or in total) exceeds INT_MAX.
 */
SEXP unnest(SEXP x) {
  if (TYPEOF(x) != VECSXP)
    error(_("x must be a list, but has type %s"), type2char(TYPEOF(x)));
  const int p = LENGTH(x);
  // no columns: nothing to expand, and column 0 must not be read
  if (p == 0) return (duplicate(x));
  const int n = LENGTH(VECTOR_ELT(x, 0));

  // Work arrays come from R_alloc rather than the stack: as variable-length
  // stack arrays they could overflow the stack (segfault) for many rows or
  // columns. R reclaims this storage when the .Call returns, also on error().
  SEXPTYPE *col_types = (SEXPTYPE *) R_alloc(p, sizeof(SEXPTYPE));

  bool all_atomic = true;
  for (int j=0; j<p; j++) {
    SEXP xj = VECTOR_ELT(x, j);
    const SEXPTYPE this_col = TYPEOF(xj);
    switch(this_col) {
    case VECSXP :
      all_atomic = false;
      // fall through: list columns are recorded in col_types like the others
    case RAWSXP :
    case LGLSXP :
    case INTSXP :
    case REALSXP :
    case CPLXSXP :
    case STRSXP :
      col_types[j] = this_col;
      break;
    default:
      error(_("Unsupported type: %s"), type2char(this_col));
    }
    // every column is later indexed by 0..n-1, so lengths must agree
    if (LENGTH(xj) != n)
      error(_("Column %d has length %d but column 1 has length %d; all columns must have the same length."), j+1, LENGTH(xj), n);
  }

  // no list columns, or no rows: nothing to expand, just be sure to copy
  if (all_atomic || n == 0) {
    return (duplicate(x));
  }

  // row_counts[i]: number of output rows generated by input row i.
  // The product is accumulated in double so that overflow past INT_MAX is
  // detected rather than wrapping.
  int *row_counts = (int *) R_alloc(n, sizeof(int));
  double total_rows = 0;
  for (int i=0; i<n; i++) {
    double this_row = 1;
    for (int j=0; j<p; j++) {
      if (col_types[j] != VECSXP) continue;
      this_row *= LENGTH(VECTOR_ELT(VECTOR_ELT(x, j), i));
    }
    if (this_row > INT_MAX)
      error(_("Implied number of unnested rows from row %d (%.0f) exceeds %d, the maximum currently allowed."), i+1, this_row, INT_MAX);
    row_counts[i] = (int) this_row;
    total_rows += this_row;
  }
  if (total_rows > INT_MAX)
    error(_("Implied number of unnested rows (%.0f) exceeds %d, the maximum currently allowed."), total_rows, INT_MAX);
  const int out_rows = (int) total_rows;

  // inner[i]: product of lengths of the list columns already expanded for row i
  int *inner = (int *) R_alloc(n, sizeof(int));
  for (int i=0; i<n; i++) inner[i] = 1;

  // TODO: factor columns?
  SEXP ans = PROTECT(allocVector(VECSXP, p));
  // Columns are filled right to left so that `inner` holds, for each list
  // column, the combined length of the list columns to its right.
  for (int j=p-1; j>=0; j--) {
    SEXP xj = VECTOR_ELT(x, j);
    int outi = 0;
    switch(col_types[j]) {
    case RAWSXP: {
      SEXP col = allocVector(RAWSXP, out_rows);
      SET_VECTOR_ELT(ans, j, col); // reachable from ans, hence protected
      Rbyte *dst = RAW(col);
      const Rbyte *src = RAW(xj);
      UNNEST_REP(dst[outi] = src[i])
    } break;
    case LGLSXP:
    case INTSXP: {
      SEXP col = allocVector(col_types[j], out_rows);
      SET_VECTOR_ELT(ans, j, col);
      int *dst = INTEGER(col);
      const int *src = INTEGER(xj);
      UNNEST_REP(dst[outi] = src[i])
    } break;
    case REALSXP: {
      SEXP col = allocVector(REALSXP, out_rows);
      SET_VECTOR_ELT(ans, j, col);
      double *dst = REAL(col);
      const double *src = REAL(xj);
      UNNEST_REP(dst[outi] = src[i])
    } break;
    case CPLXSXP: {
      SEXP col = allocVector(CPLXSXP, out_rows);
      SET_VECTOR_ELT(ans, j, col);
      Rcomplex *dst = COMPLEX(col);
      const Rcomplex *src = COMPLEX(xj);
      UNNEST_REP(dst[outi] = src[i])
    } break;
    case STRSXP: {
      SEXP col = allocVector(STRSXP, out_rows);
      SET_VECTOR_ELT(ans, j, col);
      UNNEST_REP(SET_STRING_ELT(col, outi, STRING_ELT(xj, i)))
    } break;
    case VECSXP: {
      // TODO: type bumping for mismatch. Until then the output type is that of
      // the first element, and every element that contributes rows must match
      // it: the typed accessors below must not be applied to another type.
      const SEXPTYPE elt_type = TYPEOF(VECTOR_ELT(xj, 0));
      for (int i=1; i<n; i++) {
        if (row_counts[i] == 0) continue;
        const SEXPTYPE this_type = TYPEOF(VECTOR_ELT(xj, i));
        if (this_type != elt_type)
          error(_("Element %d of list column %d has type %s but element 1 has type %s; mixed types within a list column are not yet supported."), i+1, j+1, type2char(this_type), type2char(elt_type));
      }
      switch(elt_type) {
      case RAWSXP: {
        SEXP col = allocVector(RAWSXP, out_rows);
        SET_VECTOR_ELT(ans, j, col);
        Rbyte *dst = RAW(col);
        UNNEST_CROSS(const Rbyte *src = RAW(xji), dst[outi] = src[k])
      } break;
      case LGLSXP:
      case INTSXP: {
        SEXP col = allocVector(elt_type, out_rows);
        SET_VECTOR_ELT(ans, j, col);
        int *dst = INTEGER(col);
        UNNEST_CROSS(const int *src = INTEGER(xji), dst[outi] = src[k])
      } break;
      case REALSXP: {
        SEXP col = allocVector(REALSXP, out_rows);
        SET_VECTOR_ELT(ans, j, col);
        double *dst = REAL(col);
        UNNEST_CROSS(const double *src = REAL(xji), dst[outi] = src[k])
      } break;
      case CPLXSXP: {
        SEXP col = allocVector(CPLXSXP, out_rows);
        SET_VECTOR_ELT(ans, j, col);
        Rcomplex *dst = COMPLEX(col);
        UNNEST_CROSS(const Rcomplex *src = COMPLEX(xji), dst[outi] = src[k])
      } break;
      case STRSXP: {
        SEXP col = allocVector(STRSXP, out_rows);
        SET_VECTOR_ELT(ans, j, col);
        UNNEST_CROSS((void)0, SET_STRING_ELT(col, outi, STRING_ELT(xji, k)))
      } break;
      case VECSXP: {
        SEXP col = allocVector(VECSXP, out_rows);
        SET_VECTOR_ELT(ans, j, col);
        UNNEST_CROSS((void)0, SET_VECTOR_ELT(col, outi, VECTOR_ELT(xji, k)))
      } break;
      default: error(_("Invalid nested column type %s in column %d"), type2char(elt_type), j+1);
      }
    } break;
    default: error(_("Internal error: invalid column type %s should have been caught earlier in unnest"), type2char(col_types[j]));
    }
  }

  // NOTE(review): copyMostAttrib is expected to carry over everything except
  // names/dim/dimnames, so a row.names attribute on x would still describe n
  // rows rather than out_rows -- confirm the caller resets it.
  copyMostAttrib(x, ans);
  // copy names, when x has them
  SEXP xNames = PROTECT(getAttrib(x, R_NamesSymbol));
  if (isString(xNames) && LENGTH(xNames) == p) {
    SEXP ansNames = PROTECT(allocVector(STRSXP, p));
    for (int j=0; j<p; j++) {
      SET_STRING_ELT(ansNames, j, STRING_ELT(xNames, j));
    }
    setAttrib(ans, R_NamesSymbol, ansNames);
    UNPROTECT(1); // ansNames, now held by ans
  }
  UNPROTECT(2); // xNames, ans
  return (ans);
}

#undef UNNEST_REP
#undef UNNEST_CROSS