Skip to content

Commit

Permalink
[[ by group performance (Rdatatable#4655)
Browse files Browse the repository at this point in the history
  • Loading branch information
ColeMiller1 authored and hongyuanjia committed Sep 2, 2023
1 parent a342e75 commit 565009b
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 36 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux.

3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed.

## NOTES

1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` suggests `bit64` but does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be improved to include suggests in reverse dependency testing.
Expand Down
11 changes: 11 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -17127,3 +17127,14 @@ test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2
y = person(given='Joel', family='Mossong')
test(2152, copy(y), y)

# .N and .GRP special statics copied correctly when placed as a vector in a list column; part of PR#4655
# see comments in anySpecialStatic() at the top of dogroups.c
# .SD, .I and .BY are covered by previous tests
DT = data.table(x=c(1L,2L,2L), y=1:3)
test(2153.1, DT[, .(list(.N)), by=x], data.table(x=1:2, V1=as.list(1:2)))
test(2153.2, DT[, .(list(.GRP)), by=x], data.table(x=1:2, V1=as.list(1:2)))
test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L)))
test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP
test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result
data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L))))

24 changes: 0 additions & 24 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -670,13 +670,6 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
return(dt); // needed for `*tmp*` mechanism (when := isn't used), and to return the new object after a := for compound syntax.
}

static bool anyNamed(SEXP x) {
if (MAYBE_REFERENCED(x)) return true;
if (isNewList(x)) for (int i=0; i<LENGTH(x); i++)
if (anyNamed(VECTOR_ELT(x,i))) return true;
return false;
}

#define MSGSIZE 1000
static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects

Expand Down Expand Up @@ -704,23 +697,6 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
error(_("Internal error: memrecycle has received NULL colname")); // # nocov
*memrecycle_message = '\0';
int protecti=0;
if (isNewList(source)) {
// A list() column; i.e. target is a column of pointers to SEXPs rather than the more common case of numbers in an atomic vector.
// If any item within the list is NAMED then take a fresh copy. So far this has occurred from dogroups.c when
// j returns .BY or similar specials as-is within a list(). Those specials are static inside
// dogroups so if we don't copy now the last value written to them by dogroups becomes repeated in the result;
// i.e. the wrong result.
// If source is itself recycled later (many list() column items pointing to the same object) we are ok with that
// since we now have a fresh copy and := will not assign with a list() column's cell value; := only changes the
// SEXP pointed to.
// If source is already not named (because j already created a fresh unnamed vector within a list()) we don't want to
// duplicate unnecessarily, hence checking for named rather than duplicating always.
// See #481, #1270 and tests 1341.* fail without this copy.
// ********** This might go away now that we copy properly in dogroups.c **********
if (anyNamed(source)) {
source = PROTECT(copyAsPlain(source)); protecti++;
}
}
const bool sourceIsFactor=isFactor(source), targetIsFactor=isFactor(target);
const bool sourceIsI64=isReal(source) && Rinherits(source, char_integer64);
const bool targetIsI64=isReal(target) && Rinherits(target, char_integer64);
Expand Down
100 changes: 88 additions & 12 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,54 @@
#include <fcntl.h>
#include <time.h>

static bool anySpecialStatic(SEXP x) {
// Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd
// Static because these are like C static arrays which are the same memory for each group; e.g., dogroups
// creates .SD for the largest group once up front, overwriting the contents for each group. Their
// value changes across group but not their memory address. (.NGRP is also special static but its value
// is constant across groups so that's excluded here.)
// This works well, other than a relatively rare case when two conditions are both true :
// 1) the j expression returns a group column as-is without doing any aggregation
// 2) that result is placed in a list column result
// The list column result can then incorrectly contain the result for the last group repeated for all
// groups because the list column ends up holding a pointer to these special static vectors.
// See test 2153, and to illustrate here, consider a simplified test 1341
// > DT
// x y
// <int> <int>
// 1: 1 1
// 2: 2 2
// 3: 1 3
// 4: 2 4
// > DT[, .(list(y)), by=x]
// x V1
// <int> <list>
// 1: 1 2,4 # should be 1,3
// 2: 2 2,4
//
// This has been fixed for a decade but the solution has changed over time.
//
// We don't wish to inspect the j expression for these cases because there are so many; e.g. user defined functions.
// A special symbol does not need to appear in j for the problem to occur. Using a member of .SD is enough as the example above illustrates.
// Using R's own reference counting could invoke too many unnecessary copies because these specials are routinely referenced.
// Hence we mark these specials (SD, BY, I) here in dogroups and if j's value is being assigned to a list column, we check to
// see if any specials are present and copy them if so.
// This keeps the special logic in one place in one file here. Previously this copy was done by memrecycle in assign.c but then
// with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed.
// Currently the marker is negative truelength. These specials are protected by us here and before we release them
// we restore the true truelength for when R starts to use vector truelength.
if (isVectorAtomic(x))
return TRUELENGTH(x)<0;
if (isNewList(x)) for (int i=0; i<LENGTH(x); ++i) {
if (anySpecialStatic(VECTOR_ELT(x,i)))
return true;
}
return false;
}

SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg)
{
R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, origIlen=0, origSDnrow=0;
R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp;
int nprotect=0;
SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source;
Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE;
Expand Down Expand Up @@ -37,15 +82,18 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
SET_STRING_ELT(bynames, i, STRING_ELT(getAttrib(groups,R_NamesSymbol), j));
defineVar(install(CHAR(STRING_ELT(bynames,i))), VECTOR_ELT(BY,i), env); // by vars can be used by name in j as well as via .BY
if (SIZEOF(VECTOR_ELT(BY,i))==0)
error(_("Internal error: unsupported size-0 type '%s' in column %d of 'by' should have been caught earlier"), type2char(TYPEOF(VECTOR_ELT(BY, i))), i+1); // #nocov
error(_("Internal error: unsupported size-0 type '%s' in column %d of 'by' should have been caught earlier"), type2char(TYPEOF(VECTOR_ELT(BY, i))), i+1); // # nocov
SET_TRUELENGTH(VECTOR_ELT(BY,i), -1); // marker for anySpecialStatic(); see its comments
}
setAttrib(BY, R_NamesSymbol, bynames); // Fix for #42 - BY doesn't retain names anymore
R_LockBinding(sym_BY, env);
if (isNull(jiscols) && (length(bynames)!=length(groups) || length(bynames)!=length(grpcols))) error(_("!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]"),length(bynames),length(groups),length(grpcols));
// TO DO: check this check above.

N = PROTECT(findVar(install(".N"), env)); nprotect++; // PROTECT for rchk
SET_TRUELENGTH(N, -1); // marker for anySpecialStatic(); see its comments
GRP = PROTECT(findVar(install(".GRP"), env)); nprotect++;
SET_TRUELENGTH(GRP, -1); // marker for anySpecialStatic(); see its comments
iSD = PROTECT(findVar(install(".iSD"), env)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS)
xSD = PROTECT(findVar(install(".xSD"), env)); nprotect++;
R_len_t maxGrpSize = 0;
Expand All @@ -54,6 +102,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
if (ilens[i] > maxGrpSize) maxGrpSize = ilens[i];
}
defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++;
SET_TRUELENGTH(I, -maxGrpSize); // marker for anySpecialStatic(); see its comments
R_LockBinding(install(".I"), env);

SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by"
Expand All @@ -69,23 +118,25 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
SEXP names = PROTECT(getAttrib(SDall, R_NamesSymbol)); nprotect++;
if (length(names) != length(SDall)) error(_("length(names)!=length(SD)"));
SEXP *nameSyms = (SEXP *)R_alloc(length(names), sizeof(SEXP));

for(int i=0; i<length(SDall); ++i) {
if (SIZEOF(VECTOR_ELT(SDall, i))==0)
error(_("Internal error: size-0 type %d in .SD column %d should have been caught earlier"), TYPEOF(VECTOR_ELT(SDall, i)), i); // #nocov
SEXP this = VECTOR_ELT(SDall, i);
if (SIZEOF(this)==0)
error(_("Internal error: size-0 type %d in .SD column %d should have been caught earlier"), TYPEOF(this), i); // # nocov
if (LENGTH(this) != maxGrpSize)
error(_("Internal error: SDall %d length = %d != %d"), i+1, LENGTH(this), maxGrpSize); // # nocov
nameSyms[i] = install(CHAR(STRING_ELT(names, i)));
// fixes http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by
copyMostAttrib(VECTOR_ELT(dt,INTEGER(dtcols)[i]-1), VECTOR_ELT(SDall,i)); // not names, otherwise test 778 would fail
copyMostAttrib(VECTOR_ELT(dt,INTEGER(dtcols)[i]-1), this); // not names, otherwise test 778 would fail
SET_TRUELENGTH(this, -maxGrpSize); // marker for anySpecialStatic(); see its comments
}

origIlen = length(I); // test 762 has length(I)==1 but nrow(SD)==0
if (length(SDall)) origSDnrow = length(VECTOR_ELT(SDall, 0));

SEXP xknames = PROTECT(getAttrib(xSD, R_NamesSymbol)); nprotect++;
if (length(xknames) != length(xSD)) error(_("length(xknames)!=length(xSD)"));
SEXP *xknameSyms = (SEXP *)R_alloc(length(xknames), sizeof(SEXP));
for(int i=0; i<length(xSD); ++i) {
if (SIZEOF(VECTOR_ELT(xSD, i))==0)
error(_("Internal error: type %d in .xSD column %d should have been caught by now"), TYPEOF(VECTOR_ELT(xSD, i)), i); // #nocov
error(_("Internal error: type %d in .xSD column %d should have been caught by now"), TYPEOF(VECTOR_ELT(xSD, i)), i); // # nocov
xknameSyms[i] = install(CHAR(STRING_ELT(xknames, i)));
}

Expand Down Expand Up @@ -169,7 +220,8 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
}
if (verbose) { tblock[0] += clock()-tstart; nblock[0]++; }
} else {
for (int k=0; k<grpn; ++k) iI[k] = iorder[ istarts[i]-1 + k ];
const int rownum = istarts[i]-1;
for (int k=0; k<grpn; ++k) iI[k] = iorder[rownum+k];
for (int j=0; j<length(SDall); ++j) {
// this is the main non-contiguous gather, and is parallel (within-column) for non-SEXP
subsetVectorRaw(VECTOR_ELT(SDall,j), VECTOR_ELT(dt,INTEGER(dtcols)[j]-1), I, /*anyNA=*/false);
Expand Down Expand Up @@ -239,8 +291,14 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
copyMostAttrib(RHS, target); // attributes of first group dominate; e.g. initial factor levels come from first group
}
bool copied = false;
if (isNewList(target) && anySpecialStatic(RHS)) { // see comments in anySpecialStatic()
RHS = PROTECT(duplicate(RHS));
copied = true;
}
const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, -1, 0, "");
// can't error here because length mismatch already checked for all jval columns before starting to add any new columns
if (copied) UNPROTECT(1);
if (warn)
warning(_("Group %d column '%s': %s"), i+1, CHAR(STRING_ELT(dtnames, colj)), warn);
}
Expand Down Expand Up @@ -338,7 +396,13 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
if (thislen>1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986
error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
}
bool copied = false;
if (isNewList(target) && anySpecialStatic(source)) { // see comments in anySpecialStatic()
source = PROTECT(duplicate(source));
copied = true;
}
memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
if (copied) UNPROTECT(1);
}
}
ansloc += maxn;
Expand All @@ -358,8 +422,20 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
}
} else ans = R_NilValue;
// Now reset length of .SD columns and .I to length of largest group, otherwise leak if the last group is smaller (often is).
for (int j=0; j<length(SDall); ++j) SETLENGTH(VECTOR_ELT(SDall,j), origSDnrow);
SETLENGTH(I, origIlen);
// Also reset truelength on specials; see comments in anySpecialStatic().
for (int j=0; j<length(SDall); ++j) {
SEXP this = VECTOR_ELT(SDall,j);
SETLENGTH(this, maxGrpSize);
SET_TRUELENGTH(this, maxGrpSize);
}
SETLENGTH(I, maxGrpSize);
SET_TRUELENGTH(I, maxGrpSize);
for (int i=0; i<length(BY); ++i) {
SEXP this = VECTOR_ELT(BY, i);
SET_TRUELENGTH(this, length(this)); // might be 0 or 1; see its allocVector above
}
SET_TRUELENGTH(N, 1);
SET_TRUELENGTH(GRP, 1);
if (verbose) {
if (nblock[0] && nblock[1]) error(_("Internal error: block 0 [%d] and block 1 [%d] have both run"), nblock[0], nblock[1]); // # nocov
int w = nblock[1]>0;
Expand Down

0 comments on commit 565009b

Please sign in to comment.