-
Notifications
You must be signed in to change notification settings - Fork 990
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[cbindlist/mergelist] Implement cbindlist #6435
base: cbind-merge-list-utils
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
cbindlist = function(l, copy=TRUE) { | ||
ans = .Call(Ccbindlist, l, copy) | ||
if (anyDuplicated(names(ans))) { ## invalidate key and index | ||
setattr(ans, "sorted", NULL) | ||
setattr(ans, "index", integer()) | ||
} | ||
setDT(ans) | ||
ans | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
require(methods) | ||
|
||
if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { | ||
if ((tt<-compiler::enableJIT(-1))>0) | ||
cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") | ||
} else { | ||
require(data.table) | ||
test = data.table:::test | ||
} | ||
|
||
addresses = function(x) vapply(x, address, "") | ||
|
||
# cbindlist | ||
|
||
l = list( | ||
d1 = data.table(x=1:3, v1=1L), | ||
d2 = data.table(y=3:1, v2=2L), | ||
d3 = data.table(z=2:4, v3=3L) | ||
) | ||
ans = cbindlist(l) | ||
expected = data.table(l$d1, l$d2, l$d3) | ||
test(11.01, ans, expected) | ||
test(11.02, intersect(addresses(ans), addresses(expected)), character()) | ||
ans = cbindlist(l, copy=FALSE) | ||
expected = setDT(c(l$d1, l$d2, l$d3)) | ||
test(11.03, ans, expected) | ||
test(11.04, length(intersect(addresses(ans), addresses(expected))), ncol(expected)) | ||
test(11.05, cbindlist(list(data.table(a=1L), data.table(), data.table(d=2L), data.table(f=3L))), data.table(a=1L,d=2L,f=3L)) | ||
rm(expected) | ||
## codecov | ||
test(12.01, cbindlist(data.frame(a=1L), data.frame(b=1L)), error="must be a list") | ||
test(12.02, cbindlist(TRUE, FALSE), error="must be a list") | ||
test(12.03, cbindlist(list(), NA), error="must be TRUE or FALSE") | ||
test(12.04, cbindlist(list(data.table(a=1L), 1L)), error="is not of data.table type") | ||
test(12.05, options = c(datatable.verbose=TRUE), cbindlist(list(data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2), output="cbindlist.*took") | ||
test(12.06, cbindlist(list(data.table(), data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2)) | ||
test(12.07, cbindlist(list(data.table(), data.table(a=1:2), list(b=1:2))), data.table(a=1:2, b=1:2)) | ||
test(12.08, cbindlist(list(data.table(a=integer()), list(b=integer()))), data.table(a=integer(), b=integer())) | ||
## duplicated names | ||
test(12.09, cbindlist(list(data.table(a=1L, b=2L), data.table(b=3L, d=4L))), data.table(a=1L, b=2L, b=3L, d=4L)) | ||
ans = cbindlist(list(setindexv(data.table(a=2:1, b=1:2),"a"), data.table(a=1:2, b=2:1, key="a"), data.table(a=2:1, b=1:2))) | ||
test(12.10, ans, data.table(a=2:1, b=1:2, a=1:2, b=2:1, a=2:1, b=1:2)) | ||
test(12.11, indices(ans), NULL) | ||
## recycling, first ensure cbind recycling that we want to match to | ||
test(12.12, cbind(data.table(x=integer()), data.table(a=1:2)), data.table(x=c(NA_integer_,NA), a=1:2)) | ||
test(12.13, cbind(data.table(x=1L), data.table(a=1:2)), data.table(x=c(1L,1L), a=1:2)) | ||
test(12.14, cbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="recycling.*not yet implemented") | ||
test(12.15, cbindlist(list(data.table(a=1L), data.table(b=1:2))), error="recycling.*not yet implemented") | ||
test(12.16, cbindlist(list(data.table(a=integer()), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow") | ||
test(12.17, cbindlist(list(data.table(a=1L), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow") | ||
|
||
## retain indices | ||
d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2) ## ensure setDT will retain key and indices when it is called on the list, bc Ccbindlist returns list | ||
setkeyv(d, "x"); setindexv(d, list("y", "z")) | ||
a = attributes(d) | ||
attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")] | ||
test(13.01, class(d), "list") | ||
setDT(d) | ||
test(13.02, key(d), "x") | ||
# test(13.03, hasindex(d, "y") && hasindex(d, "z")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing test for case when keyed entry is not the first input There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See added testcase below |
||
l = list( | ||
data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5), | ||
data.table(id4=5:1, id5=1:5, v2=1:5), | ||
data.table(id6=5:1, id7=1:5, v3=1:5), | ||
data.table(id8=5:1, id9=5:1, v4=1:5) | ||
) | ||
setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], list("id1", "id2", "id3", c("id1","id2","id3"))); setindexv(l[[3L]], list("id6", "id7")); setindexv(l[[4L]], "id9") | ||
ii = lapply(l, indices) | ||
ans = cbindlist(l) | ||
test(13.04, key(ans), "id1") | ||
test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9")) | ||
test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
[Re: line +73]
```gt_suggestion ans = cbindlist(list(data.table(a=1:2), data.table(b=3:4, key="b")))
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,36 @@ | ||||||||||||||||||||||||||||||
\name{cbindlist} | ||||||||||||||||||||||||||||||
\alias{cbindlist} | ||||||||||||||||||||||||||||||
\alias{cbind} | ||||||||||||||||||||||||||||||
\alias{cbind.data.table} | ||||||||||||||||||||||||||||||
\title{Column bind multiple data.tables} | ||||||||||||||||||||||||||||||
\description{ | ||||||||||||||||||||||||||||||
Column bind multiple \code{data.table}s. | ||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\usage{ | ||||||||||||||||||||||||||||||
cbindlist(l, copy=TRUE) | ||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\arguments{ | ||||||||||||||||||||||||||||||
\item{l}{ \code{list} of \code{data.table}s to merge. } | ||||||||||||||||||||||||||||||
\item{copy}{ \code{logical}, decides if columns has to be copied into resulting object (default) or just referred. } | ||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this description is not clear to me, and there's no example |
||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\details{ | ||||||||||||||||||||||||||||||
Column bind only stacks input elements. Works like \code{\link{data.table}}, but takes \code{list} type on input. Zero-column tables in \code{l} are omitted. Tables in \code{l} should have matching row count; recycling of length-1 rows is not yet implemented. Indices of the input tables are transferred to the resulting table, as well as the \emph{key} of the first keyed table. | ||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\value{ | ||||||||||||||||||||||||||||||
A new \code{data.table} based on the stacked objects. Eventually when \code{copy} is \code{FALSE}, then resulting object will share columns with \code{l} tables. | ||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\note{ | ||||||||||||||||||||||||||||||
If output object has any duplicate names, then key and indices are removed. | ||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does that also apply if the duplicates are not among the key/index columns? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes and this seems the save approach, since |
||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\seealso{ | ||||||||||||||||||||||||||||||
\code{\link{data.table}}, \code{\link{rbindlist}} | ||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\examples{ | ||||||||||||||||||||||||||||||
l = list( | ||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add more examples, e.g. |
||||||||||||||||||||||||||||||
d1 = data.table(x=1:3, v1=1L), | ||||||||||||||||||||||||||||||
d2 = data.table(y=3:1, v2=2L), | ||||||||||||||||||||||||||||||
d3 = data.table(z=2:4, v3=3L) | ||||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||||
cbindlist(l) | ||||||||||||||||||||||||||||||
Comment on lines
+29
to
+34
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
also subject to the change that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if there possibly was a reason why integer() was used instead of NULL. You may want to look at other PRs that this one was split into, maybe there is a use of it. Unfortunately don't remember now. |
||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||
\keyword{ data } |
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,81 @@ | ||||||||||
#include "data.table.h" | ||||||||||
|
||||||||||
void mergeIndexAttrib(SEXP to, SEXP from) { | ||||||||||
if (!isInteger(to) || LENGTH(to)!=0) | ||||||||||
internal_error(__func__, "'to' must be integer() already"); // # nocov | ||||||||||
if (isNull(from)) | ||||||||||
return; | ||||||||||
SEXP t = ATTRIB(to), f = ATTRIB(from); | ||||||||||
if (isNull(f)) | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
return; | ||||||||||
if (isNull(t)) | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
SET_ATTRIB(to, shallow_duplicate(f)); | ||||||||||
else { | ||||||||||
for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand what's happening here. Please add a one-sentence description of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
SETCDR(t, shallow_duplicate(f)); | ||||||||||
} | ||||||||||
return; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
skip empty return at end |
||||||||||
} | ||||||||||
|
||||||||||
SEXP cbindlist(SEXP x, SEXP copyArg) { | ||||||||||
if (!isNewList(x) || isFrame(x)) | ||||||||||
error(_("'%s' must be a list"), "x"); | ||||||||||
if (!IS_TRUE_OR_FALSE(copyArg)) | ||||||||||
error(_("'%s' must be TRUE or FALSE"), "copy"); | ||||||||||
bool copy = (bool)LOGICAL(copyArg)[0]; | ||||||||||
const bool verbose = GetVerbose(); | ||||||||||
double tic = 0; | ||||||||||
if (verbose) | ||||||||||
tic = omp_get_wtime(); | ||||||||||
int nx = length(x), nans = 0, nr = -1, *nnx = (int*)R_alloc(nx, sizeof(int)); | ||||||||||
MichaelChirico marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
bool recycle = false; | ||||||||||
for (int i=0; i<nx; ++i) { | ||||||||||
SEXP thisx = VECTOR_ELT(x, i); | ||||||||||
if (!perhapsDataTable(thisx)) | ||||||||||
error(_("Element %d of 'l' list is not a data.table."), i+1); | ||||||||||
nnx[i] = n_columns(thisx); | ||||||||||
if (!nnx[i]) | ||||||||||
continue; | ||||||||||
int thisnr = n_rows(thisx); | ||||||||||
if (nr < 0) // first (non-zero length table) iteration | ||||||||||
nr = thisnr; | ||||||||||
else if (nr != thisnr) { | ||||||||||
if (!copy) | ||||||||||
error(_("For copy=FALSE all non-empty tables in 'l' have to have the same number of rows, but l[[%d]] has %d rows which differs from the previous non-zero number of rows (%d)."), i+1, thisnr, nr); | ||||||||||
recycle = true; | ||||||||||
} | ||||||||||
nans += nnx[i]; | ||||||||||
} | ||||||||||
if (recycle) | ||||||||||
error(_("Recycling rows is not yet implemented.")); // dont we have a routines for that already somewhere? | ||||||||||
SEXP ans = PROTECT(allocVector(VECSXP, nans)); | ||||||||||
SEXP index = PROTECT(allocVector(INTSXP, 0)); | ||||||||||
SEXP key = R_NilValue; | ||||||||||
setAttrib(ans, sym_index, index); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could also set index below once, after checking if it contains atttributes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We set here integer() btw There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes and AFAIU the reason for setting it to |
||||||||||
SEXP names = PROTECT(allocVector(STRSXP, nans)); | ||||||||||
for (int i=0, ians=0; i<nx; ++i) { | ||||||||||
int protecti =0; | ||||||||||
SEXP thisx = VECTOR_ELT(x, i); | ||||||||||
SEXP thisnames = PROTECT(getAttrib(thisx, R_NamesSymbol)); protecti++; | ||||||||||
for (int j=0; j<nnx[i]; ++j, ++ians) { | ||||||||||
SEXP thisxcol; | ||||||||||
if (copy) { | ||||||||||
thisxcol = PROTECT(duplicate(VECTOR_ELT(thisx, j))); protecti++; | ||||||||||
} else { | ||||||||||
thisxcol = VECTOR_ELT(thisx, j); | ||||||||||
} | ||||||||||
SET_VECTOR_ELT(ans, ians, thisxcol); | ||||||||||
SET_STRING_ELT(names, ians, STRING_ELT(thisnames, j)); | ||||||||||
} | ||||||||||
mergeIndexAttrib(index, getAttrib(thisx, sym_index)); | ||||||||||
if (isNull(key)) // first key is retained | ||||||||||
key = getAttrib(thisx, sym_sorted); | ||||||||||
UNPROTECT(protecti); | ||||||||||
} | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
do not set index as empty vector |
||||||||||
setAttrib(ans, R_NamesSymbol, names); | ||||||||||
setAttrib(ans, sym_sorted, key); | ||||||||||
if (verbose) | ||||||||||
Rprintf(_("cbindlist: took %.3fs\n"), omp_get_wtime()-tic); | ||||||||||
UNPROTECT(3); | ||||||||||
return ans; | ||||||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
require(data.table) | ||
test.data.table(script="mergelist.Rraw") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we also do not set
"index"
tointeger()
initially, see also.shallow