diff --git a/NAMESPACE b/NAMESPACE
index 2bc30543f8..be238d75b5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -58,6 +58,7 @@ export(nafill)
 export(setnafill)
 export(.Last.updated)
 export(fcoalesce)
+export(cbindlist)
 export(substitute2)
 #export(DT) # mtcars |> DT(i,j,by) #4872 #5472
 
diff --git a/R/mergelist.R b/R/mergelist.R
new file mode 100644
index 0000000000..9606ce0abb
--- /dev/null
+++ b/R/mergelist.R
@@ -0,0 +1,9 @@
+cbindlist = function(l, copy=TRUE) { ## column-binds the tables in list 'l' into one data.table; 'l' and 'copy' are validated by the C routine, not here
+  ans = .Call(Ccbindlist, l, copy) ## result is still a plain list at this point; 'copy' is forwarded as-is (tests 11.03-11.04 expect copy=FALSE to reuse the input columns)
+  if (anyDuplicated(names(ans))) { ## duplicated column names in the result: invalidate key and index
+    setattr(ans, "sorted", NULL) ## drop the key
+    setattr(ans, "index", integer()) ## leave an empty index attribute; test 12.11 expects indices(ans) to be NULL afterwards
+  }
+  setDT(ans) ## by reference (return value not needed): turns the list into a data.table
+  ans
+}
diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw
new file mode 100644
index 0000000000..693efa21d4
--- /dev/null
+++ b/inst/tests/mergelist.Rraw
@@ -0,0 +1,1031 @@
+require(methods)
+
+if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { ## dev mode (see message below): test.data.table lives in the global env
+  if ((tt<-compiler::enableJIT(-1))>0) ## negative level only queries the current JIT level, see ?compiler::enableJIT
+    cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
+} else { ## otherwise attach the package and fetch the non-exported functions these tests call
+  require(data.table)
+  test = data.table:::test
+  mergepair = data.table:::mergepair
+  perhaps.data.table = data.table:::perhaps.data.table
+  hasindex = data.table:::hasindex
+  fdistinct = data.table:::fdistinct
+  forderv = data.table:::forderv
+}
+
+addresses = function(x) vapply(x, address, "") ## character vector: address() of every element (column) of x
+copied = function(ans, l) { ## TRUE when no column of 'ans' has the address of any column of any table in list 'l'
+  all(!addresses(ans) %chin% unlist(recursive=FALSE, lapply(l, addresses)))
+}
+notcopied = function(ans, l, how="left", unless=character()) { ## TRUE when all columns of the relevant input table (chosen by 'how') are found in 'ans' by address
+  if (how %chin% unless) return(copied(ans, l)) ## join types listed in 'unless' are expected to be copied instead; used during looping tests for easier escape
+  if (how=="full") return( ## either side, left|right, notcopied is fine
+    all(addresses(l[[1L]]) %chin% addresses(ans)) || all(addresses(l[[length(l)]]) %chin% addresses(ans))
+  )
+  all(addresses(if (how=="right") l[[length(l)]] else l[[1L]]) %chin% addresses(ans)) ## how="right": last table of 'l' must be reused, any other 'how': the first
+}
+
+# internal helpers
+
+test(1.01, perhaps.data.table(list()))
+test(1.02, perhaps.data.table(list(a=1:2)))
+test(1.03, perhaps.data.table(list(a=1:2, b=1:2)))
+test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE) ## list without names gives FALSE
+
+test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table")
+test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of")
+test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of")
+test(2.04, fdistinct(data.table(x=c(1L,1:2, y=1:3)), on="x", mult="last", copy=NA), error="must be TRUE or FALSE") ## NOTE(review): y=1:3 sits inside c(); looks like data.table(x=c(1L,1:2), y=1:3) was intended - TODO confirm
+d = data.table(x=1:2, y=1:2)
+test(2.05, ans<-fdistinct(d, on="x", mult="last"), d)
+test(2.06, intersect(addresses(ans), addresses(d)), character()) ## default copy: result shares no column with d
+test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d)
+test(2.08, addresses(ans), addresses(d)) ## copy=FALSE: result columns are d's own columns
+d = data.table(x=c(2:1,2L), y=1:3)
+test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
+test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3))
+setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386
+test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
+
+test(3.01, hasindex(d, "x"))
+test(3.02, hasindex(d, "x", retGrp=TRUE))
+setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386
+test(3.03, hasindex(d, "x"))
+test(3.04, !hasindex(d, "x", retGrp=TRUE))
+setattr(d, "index", NULL)
+test(3.05, !hasindex(d, "x"))
+test(3.06, !hasindex(d, "x", retGrp=TRUE))
+setattr(d, "index", integer())
+test(3.07, !hasindex(d, "x"))
+test(3.08, !hasindex(d, "x", retGrp=TRUE))
+rm(d)
+
+# cbindlist
+
+l = list(
+  d1 = data.table(x=1:3, v1=1L),
+  d2 = data.table(y=3:1, v2=2L),
+  d3 = data.table(z=2:4, v3=3L)
+)
+ans = cbindlist(l)
+expected = data.table(l$d1, l$d2, l$d3)
+test(11.01, ans, expected)
+test(11.02, intersect(addresses(ans), addresses(expected)), character())
+ans = cbindlist(l, copy=FALSE)
+expected = setDT(c(l$d1, l$d2, l$d3))
+test(11.03, ans, expected)
+test(11.04, length(intersect(addresses(ans), addresses(expected))), ncol(expected)) ## copy=FALSE: every column address of ans is also found in expected
+test(11.05, cbindlist(list(data.table(a=1L), data.table(), data.table(d=2L), data.table(f=3L))), data.table(a=1L,d=2L,f=3L))
+rm(expected)
+## codecov
+test(12.01, cbindlist(data.frame(a=1L), data.frame(b=1L)), error="must be a list")
+test(12.02, cbindlist(TRUE, FALSE), error="must be a list")
+test(12.03, cbindlist(list(), NA), error="must be TRUE or FALSE")
+test(12.04, cbindlist(list(data.table(a=1L), 1L)), error="is not of data.table type")
+op = options(datatable.verbose=TRUE)
+test(12.05, cbindlist(list(data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2), output="cbindlist.*took")
+options(op) ## restore the option value saved in 'op'
+test(12.06, cbindlist(list(data.table(), data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2))
+test(12.07, cbindlist(list(data.table(), data.table(a=1:2), list(b=1:2))), data.table(a=1:2, b=1:2))
+test(12.08, cbindlist(list(data.table(a=integer()), list(b=integer()))), data.table(a=integer(), b=integer()))
+## duplicated names
+test(12.09, cbindlist(list(data.table(a=1L, b=2L), data.table(b=3L, d=4L))), data.table(a=1L, b=2L, b=3L, d=4L))
+ans = cbindlist(list(setindexv(data.table(a=2:1, b=1:2),"a"), data.table(a=1:2, b=2:1, key="a"), data.table(a=2:1, b=1:2)))
+test(12.10, ans, data.table(a=2:1, b=1:2, a=1:2, b=2:1, a=2:1, b=1:2))
+test(12.11, indices(ans), NULL)
+## recycling, first ensure cbind recycling that we want to match to
+test(12.12, cbind(data.table(x=integer()), data.table(a=1:2)), data.table(x=c(NA_integer_,NA), a=1:2))
+test(12.13, cbind(data.table(x=1L), data.table(a=1:2)), data.table(x=c(1L,1L), a=1:2))
+test(12.14, cbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="recycling.*not yet implemented")
+test(12.15, cbindlist(list(data.table(a=1L), data.table(b=1:2))), error="recycling.*not yet implemented")
+test(12.16, cbindlist(list(data.table(a=integer()), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow")
+test(12.17, cbindlist(list(data.table(a=1L), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow")
+
+## retain indices
+d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2) ## ensure setDT will retain key and indices when it is called on the list, bc Ccbindlist returns list
+setkeyv(d, "x"); setindexv(d, list("y", "z"))
+a = attributes(d)
+attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")] ## keep every attribute except these three, so d becomes a plain list (checked by 13.01)
+test(13.01, class(d), "list")
+setDT(d)
+test(13.02, key(d), "x")
+test(13.03, hasindex(d, "y") && hasindex(d, "z"))
+l = list(
+  data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5),
+  data.table(id4=5:1, id5=1:5, v2=1:5),
+  data.table(id6=5:1, id7=1:5, v3=1:5),
+  data.table(id8=5:1, id9=5:1, v4=1:5)
+)
+setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], list("id1", "id2", "id3", c("id1","id2","id3"))); setindexv(l[[3L]], list("id6", "id7")); setindexv(l[[4L]], "id9")
+ii = lapply(l, indices)
+ans = cbindlist(l)
+test(13.04, key(ans), "id1")
+test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
+test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib
+
+# mergepair
+
+## test copy-ness argument in mergepair
+
+### LHS equal to RHS: no copy in all cases
+num = 21.000 ## test ids in the loops below are built as 21.<how><mult><test>, one decimal digit per level
+l = list(
+  lhs = data.table(id1=1:2, v1=1:2),
+  rhs = data.table(id1=1:2, v2=1:2)
+)
+expected = data.table(id1=1:2, v1=1:2, v2=1:2)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1 ## next 'how': bump the 1st decimal and drop the digits below it
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01 ## next 'mult': bump the 2nd decimal and drop the 3rd; each test() then bumps the 3rd
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected) ## copy=TRUE: no shared columns
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected) ## copy=FALSE: LHS shared but no RHS
+    test(num<-num+0.001, notcopied(ans, l, how=how))
+  }
+}
+### RHS includes LHS: no copy in inner, left, right
+num = 22.000
+unless = "full" ## join types for which notcopied() checks copied() instead, i.e. a copy is expected even with copy=FALSE
+l = list(
+  lhs = data.table(id1=1:2, v1=1:2),
+  rhs = data.table(id1=1:3, v2=1:3)
+)
+expected = list(
+  inner = data.table(id1=1:2, v1=1:2, v2=1:2),
+  left = data.table(id1=1:2, v1=1:2, v2=1:2),
+  right = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3),
+  full = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3)
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS includes RHS: no copy in left, right, full
+num = 23.000
+unless = "inner"
+l = list(
+  lhs = data.table(id1=1:3, v1=1:3),
+  rhs = data.table(id1=1:2, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=1:2, v1=1:2, v2=1:2),
+  left = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)),
+  right = data.table(id1=1:2, v1=1:2, v2=1:2),
+  full = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS single nonmatch RHS on both sides: no copy in left, right
+num = 24.000
+unless = c("inner","full")
+l = list(
+  lhs = data.table(id1=3:1, v1=1:3),
+  rhs = data.table(id1=c(4L,2:1), v2=1:3)
+)
+expected = list(
+  inner = data.table(id1=2:1, v1=2:3, v2=2:3),
+  left = data.table(id1=3:1, v1=1:3, v2=c(NA,2:3)),
+  right = data.table(id1=c(4L,2:1), v1=c(NA,2:3), v2=1:3),
+  full = data.table(id1=c(3:1,4L), v1=c(1:3,NA), v2=c(NA,2:3,1L))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS zero match RHS: no copy in left, right
+num = 25.000
+unless = c("inner","full")
+l = list(
+  lhs = data.table(id1=2:1, v1=1:2),
+  rhs = data.table(id1=3:4, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
+  right = data.table(id1=3:4, v1=c(NA_integer_,NA), v2=1:2),
+  full = data.table(id1=c(2:1,3:4), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS and RHS zero nrow: no copies
+num = 26.000
+unless = character()
+l = list(
+  lhs = data.table(id1=integer(), v1=integer()),
+  rhs = data.table(id1=integer(), v2=integer())
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=integer(), v1=integer(), v2=integer()),
+  right = data.table(id1=integer(), v1=integer(), v2=integer()),
+  full = data.table(id1=integer(), v1=integer(), v2=integer())
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS has zero nrow: no copies
+num = 27.000
+unless = character()
+l = list(
+  lhs = data.table(id1=integer(), v1=integer()),
+  rhs = data.table(id1=2:1, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=integer(), v1=integer(), v2=integer()),
+  right = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2),
+  full = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### RHS has zero nrow
+num = 28.000
+unless = "inner"
+l = list(
+  lhs = data.table(id1=2:1, v1=1:2),
+  rhs = data.table(id1=integer(), v2=integer())
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
+  right = data.table(id1=integer(), v1=integer(), v2=integer()),
+  full = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+
+# mergelist
+
+## coverage
+test(101.01, mergelist(data.table(x=1L), on="x"), error="must be a list")
+test(101.02, mergelist(list(data.table(x=1L)), on="x", copy=NA), error="must be TRUE or FALSE")
+test(101.03, mergelist(list(data.table(x=1L), data.table(x=1L)), how="cross", on="y"), error="cross join must be used with zero-length on, mult='all', join.many=TRUE")
+test(101.04, mergelist(list(data.table(x=1L), list(x=1:2, y=1L)), on="x"), error="must be data.table objects")
+l = list(d<-data.table(x=1:2))
+test(101.05, ans<-mergelist(l, on="x", how="left", mult="first"), d)
+test(101.06, intersect(addresses(d), addresses(ans)), character())
+test(101.07, ans<-mergelist(l, on="x", how="left", mult="first", copy=FALSE), d)
+test(101.08, addresses(d), addresses(ans))
+op = options("datatable.verbose"=TRUE)
+test(101.09, mergelist(l, on="x"), d, output="mergelist.*1 table.*took")
+options(op)
+l = list(data.table(x=1:2), data.table(x=2:3))
+test(101.10, mergelist(l, on=character()), error="non-zero length character vector")
+op = options("datatable.verbose"=TRUE)
+test(101.11, mergelist(l, on="x"), data.table(x=1:2), output="mergelist.*2 tables.*took")
+options(op)
+test(101.12, mergelist(l, on="xx"), error="are not present in LHS")
+test(101.13, mergelist(l, on="x", join.many=NA), error="must be TRUE or FALSE")
+test(101.14, mergelist(list(data.table(a=1L), data.table(a=c(1L,1L))), on="a", mult="all"), data.table(a=c(1L,1L))) ## copyCols(, cols=integer())
+test(101.15, mergelist(list()), data.table())
+test(101.16, mergelist(list(data.table())), error="must be non-zero columns tables")
+test(101.17, mergelist(list(data.table(), data.table())), error="must be non-zero columns tables")
+test(101.18, mergelist(list(data.table(a=integer()), data.table(a=integer())), on="a"), data.table(a=integer()))
+test(101.19, mergelist(list(data.table(a=1L), data.table(a=1L, b=1L, b=1L)), on="a"), error="have duplicated column names")
+test(101.20, mergelist(list(data.table(a=1L, b=1L), data.table(a=1L, b=2L)), on="a"), error="merge result has duplicated column names")
+test(101.21, mergelist(list(data.table(a=1L, b=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, character())), data.table(a=1L, b=1L))
+test(101.22, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a",NA))), error="must be a list of non-zero length, non-NA, non-duplicated, character vectors")
+test(101.23, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a","a"))), error="must be a list of non-zero length, non-NA, non-duplicated, character vectors")
+test(101.24, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", join.many=list(TRUE, TRUE)), error="must be TRUE or FALSE, or a list of such which length must be")
+test(101.25, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", join.many=list(NA)), error="must be TRUE or FALSE, or a list of such which length must be")
+test(101.26, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on=c("a","a")), error="non-NA, non-duplicated, character vector, or a list")
+test(101.27, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on=c("a",NA)), error="non-NA, non-duplicated, character vector, or a list")
+test(101.28, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L), data.table(a=1L)), on=list("a", c("a",NA))), error="non-NA, non-duplicated, character vector, or a list")
+test(101.29, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L), data.table(a=1L)), on=list("a", NULL)), error="non-NA, non-duplicated, character vector, or a list")
+test(101.30, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L), data.table(a=1L)), on=list("a", c("a","a"))), error="non-NA, non-duplicated, character vector, or a list")
+test(101.31, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on=list("a","a")), error="non-NA, non-duplicated, character vector, or a list")
+test(101.32, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a","a"))), error="list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL")
+test(101.33, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a",NA))), error="list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL")
+test(101.34, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a","a"))), error="list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL") ## NOTE(review): same call and same expected error as 101.32
+test(101.35, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL)), error="must be same length")
+test(101.36, mergelist(list(data.table(x=1L), data.table(y=1L)), how="cross", mult="first"), error="cross join must be used with zero-length on, mult='all', join.many=TRUE")
+test(101.37, mergelist(list(data.table(x=1L), data.table(y=1L)), how="cross", join.many=FALSE), error="cross join must be used with zero-length on, mult='all', join.many=TRUE")
+test(101.38, mergelist(list(data.table(x=1L), data.table(x=1L)), how="semi", on="x", mult="all"), error="semi and anti joins must be used with mult!='all'")
+test(101.39, mergelist(list(data.table(x=1L), data.table(x=1L)), how="anti", on="x", mult="all"), error="semi and anti joins must be used with mult!='all'")
+test(101.41, mergelist(list(data.table(id1=c(3:2,2L), v1=1:3), data.table(id1=c(2L,2:1), v2=1:3)), how="full", on="id1", mult="error"), error="multiple matches during merge")
+test(101.42, mergelist(list(data.table(id1=3:2, v1=1:2), data.table(id1=c(2L,2:1), v2=1:3)), how="full", on="id1", mult="error"), error="multiple matches during merge")
+test(101.43, mergelist(list(data.table(id1=c(3:2,2L), v1=1:3), data.table(id1=2:1, v2=2:3)), how="full", on="id1", mult="error"), error="multiple matches during merge") ## how="full" mult="error" confirm that second binary merge anti join 'bns', is covering both sides error, this test doesnt ensure about that but serves a data that has been used for debugging with extra 'cat("before|after bns\n")'
+test(101.44, mergelist(list(data.table(x=1L), data.table(x=1L)), on="x", mult="bad"), error="'mult' must be one of [error, all, first, last] or NULL, or a list")
+test(101.45, mergelist(list(data.table(x=1:2), data.table(x=1L, v2=1L)), on="x"), data.table(x=1:2, v2=c(1L,NA))) ## match.arg sets how="left", only when missing or NULL, otherwise we do own check
+test(101.46, mergelist(list(data.table(x=1:2), data.table(x=1L, v2=1L)), on="x", how=NULL), data.table(x=1:2, v2=c(1L,NA))) ## still match.arg
+test(101.47, mergelist(list(data.table(x=1L), data.table(x=1L)), on="x", how="bad"), error="'how' must be one of [left, inner, full, right, semi, anti, cross], or a list")
+test(101.48, mergelist(list(data.table(x=1L), data.table(x=1L)), on="x", how=list("bad")), error="'how' must be one of [left, inner, full, right, semi, anti, cross], or a list")
+
+## missing on argument
+l = list(data.table(x=1:2), data.table(x=2:3))
+test(102.01, mergelist(l, how="inner"), error="necessary key is not present")
+test(102.02, mergelist(l, how="left"), error="necessary key is not present")
+test(102.03, mergelist(l, how="right"), error="necessary key is not present")
+test(102.04, mergelist(l, how="full"), error="necessary key is not present")
+l = list(data.table(x=1:2, key="x"), data.table(x=2:3))
+test(102.11, mergelist(l, how="inner"), data.table(x=2L, key="x"))
+test(102.12, mergelist(l, how="left"), error="necessary key is not present")
+test(102.13, mergelist(l, how="right"), data.table(x=2:3))
+test(102.14, mergelist(l, how="full"), data.table(x=1:3))
+l = list(data.table(x=1:2), data.table(x=2:3, key="x"))
+test(102.21, mergelist(l, how="inner"), data.table(x=2L))
+test(102.22, mergelist(l, how="left"), data.table(x=1:2))
+test(102.23, mergelist(l, how="right"), error="necessary key is not present")
+test(102.24, mergelist(l, how="full"), data.table(x=1:3))
+l = list(data.table(x=1:2, key="x"), data.table(x=2:3, key="x"))
+test(102.31, mergelist(l, how="inner"), data.table(x=2L, key="x")) ## ordered subset
+test(102.32, mergelist(l, how="left"), data.table(x=1:2, key="x"))
+test(102.33, mergelist(l, how="right"), data.table(x=2:3, key="x"))
+test(102.34, mergelist(l, how="full"), data.table(x=1:3))
+l = list(data.table(x=1:2, y=1:2, z=1:2, zz=1:2, key=c("y","x","z","zz")), data.table(a=2:3, b=2:3, x=2:3, y=2:3, key=c("x","y","a")))
+test(102.41, mergelist(l, how="inner"), data.table(x=2L, y=2L, z=2L, zz=2L, a=2L, b=2L, key=c("y","x","z","zz"))) ## key len 2+ to take intersect, and align order, for inner and full
+test(102.42, mergelist(l, how="left"), error="specify columns to join.*that are not present in LHS table")
+test(102.43, mergelist(l, how="right"), error="specify columns to join.*that are not present in RHS table")
+test(102.44, mergelist(l, how="full"), data.table(x=1:3, y=1:3, z=c(1:2,NA), zz=c(1:2,NA), a=c(NA,2:3), b=c(NA,2:3)))
+l = list(data.table(a=1:2, x=1:2, key=c("x","a")), data.table(x=2:3, y=2:3, z=2:3, key=c("y","x","z")))
+test(102.51, mergelist(l, how="inner"), data.table(x=2L, a=2L, y=2L, z=2L, key=c("x","a"))) ## align order to shorter
+test(102.52, mergelist(l, how="left"), error="specify columns to join.*that are not present in LHS table")
+test(102.53, mergelist(l, how="right"), error="specify columns to join.*that are not present in RHS table")
+test(102.54, mergelist(l, how="full"), data.table(x=1:3, a=c(1:2,NA), y=c(NA,2:3), z=c(NA,2:3)))
+## missing on, cascade join fields
+l = list(
+  fact = data.table(id=1:16, state_id=1:8, population=1, key="id"),
+  state = data.table(state_id=1:8, division_id=1:4, key="state_id"),
+  division = data.table(division_id=1:4, region_id=1:2, key="division_id"),
+  region = data.table(region_id=1:2, key="region_id")
+)
+ans1 = mergelist(l, how="left")
+ans2 = mergelist(rev(l), how="right")
+test(102.611, all.equal(ans1, ans2, ignore.col.order=TRUE))
+test(102.612, ans1, data.table(key="id", region_id=rep(1:2, 8), division_id=rep(1:4, 4), state_id=rep(1:8, 2), id=1:16, population=1))
+test(102.613, copied(ans1, l))
+test(102.614, copied(ans2, l))
+ans1 = mergelist(l, how="left", copy=FALSE)
+ans2 = mergelist(rev(l), how="right", copy=FALSE)
+test(102.621, all.equal(ans1, ans2, ignore.col.order=TRUE))
+test(102.622, ans1, data.table(key="id", region_id=rep(1:2, 8), division_id=rep(1:4, 4), state_id=rep(1:8, 2), id=1:16, population=1))
+test(102.623, notcopied(ans1, l))
+test(102.624, notcopied(ans2, rev(l), how="right"))
+test(102.625, !notcopied(ans1, l, how="right")) ## test notcopied helper function rather than mergelist
+test(102.626, !notcopied(ans2, rev(l), how="left")) ## test notcopied
+l = list( ## duplicates on one level
+  fact = data.table(id=1:16, state_id=1:8, population=1, key="id"),
+  state = data.table(state_id=1:8, division_id=1:4, key="state_id"),
+  division = data.table(division_id=c(1:4,1:2), region_id=1:2, key="division_id"),
+  region = data.table(region_id=2:1, key="region_id")
+)
+test(102.631, mergelist(l), error="multiple matches during merge")
+test(102.632, nrow(ans1<-mergelist(l, mult="all")), 24L)
+test(102.633, mergelist(l, how="right"), error="are not present in RHS table")
+test(102.634, mergelist(rev(l), how="right"), error="multiple matches during merge")
+test(102.635, nrow(ans2<-mergelist(rev(l), how="right", mult="all")), 24L)
+test(102.636, all.equal(ans1, ans2, ignore.col.order=TRUE))
+rm(ans1, ans2)
+## on list
+test(102.71, mergelist(list(data.table(x=1L, y=2L), data.table(a=1L, y=2L), data.table(a=1L, z=2L)), on=list("y","a")), data.table(a=1L, y=2L, x=1L, z=2L))
+test(102.72, mergelist(list(data.table(x=1L, y=2L), data.table(a=1L, y=2L, b=3L), data.table(a=1L, b=3L, z=2L)), on=list("y",c("a","b"))), data.table(a=1L, b=3L, y=2L, x=1L, z=2L))
+test(102.73, mergelist(list(data.table(x=1L, y=2L), data.table(a=1L, y=2L, b=3L), data.table(a=1L, b=3L, z=2L)), on=list("y",c("a","x"))), error="specify columns to join.*that are not present in RHS table")
+
+## cols argument
+l = list(data.table(id1=1:2, v1=1:2, v2=2:1, key="id1"), data.table(id1=2:3, v3=1:2, v4=2:1, key="id1"))
+test(103.01, mergelist(l, how="inner"), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1"))
+test(103.02, mergelist(l, how="left"), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1"))
+test(103.03, mergelist(l, how="right"), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1"))
+test(103.04, mergelist(l, how="full"), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1)))
+test(103.11, mergelist(l, how="inner", cols="v2"), error="must be a list")
+test(103.12, mergelist(l, how="inner", cols=list("v2")), error="must be same length as")
+test(103.13, mergelist(l, how="inner", cols=list("v2",2L)), error="must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL")
+test(103.14, mergelist(l, how="inner", cols=list("v2","v5")), error="specify columns not present in corresponding table")
+cols = list(c("v1","v2"), c("v3","v4"))
+test(103.21, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1"))
+test(103.22, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1"))
+test(103.23, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1"))
+test(103.24, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1)))
+cols = list(NULL, c("v3","v4"))
+test(103.25, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1"))
+test(103.26, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1"))
+test(103.27, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1"))
+test(103.28, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1)))
+cols = list(c("v1","v2"), NULL)
+test(103.29, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1"))
+test(103.30, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1"))
+test(103.31, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1"))
+test(103.32, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1)))
+cols = list(NULL, NULL)
+test(103.33, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1"))
+test(103.34, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1"))
+test(103.35, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1"))
+test(103.36, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1)))
+cols = list("v2", NULL)
+test(103.41, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v2=1L, v3=1L, v4=2L, key="id1"))
+test(103.42, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1"))
+test(103.43, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v2=c(1L,NA), v3=1:2, v4=2:1, key="id1"))
+test(103.44, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1)))
+cols = list(NULL, "v4")
+test(103.45, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v4=2L, key="id1"))
+test(103.46, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v4=c(NA,2L), key="id1"))
+test(103.47, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v4=2:1, key="id1"))
+test(103.48, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v4=c(NA,2:1)))
+cols = list("v2", "v4")
+test(103.49, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v2=1L, v4=2L, key="id1"))
+test(103.50, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v2=2:1, v4=c(NA,2L), key="id1"))
+test(103.51, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v2=c(1L,NA), v4=2:1, key="id1"))
+test(103.52, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v2=c(2:1,NA), v4=c(NA,2:1)))
+cols = list(c("id1","v2"), c("id1","v4"))
+test(103.61, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v2=1L, v4=2L, key="id1"))
+test(103.62, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v2=2:1, v4=c(NA,2L), key="id1"))
+test(103.63, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v2=c(1L,NA), v4=2:1, key="id1"))
+test(103.64, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v2=c(2:1,NA), v4=c(NA,2:1)))
+cols = list("id1", c("id1","v4"))
+test(103.65, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v4=2L, key="id1"))
+test(103.66, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v4=c(NA,2L), key="id1"))
+test(103.67, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v4=2:1, key="id1"))
+test(103.68, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v4=c(NA,2:1)))
+cols = list("id1", "id1")
+test(103.69, mergelist(l, how="inner", cols=cols), data.table(id1=2L, key="id1"))
+test(103.70, mergelist(l, how="left", cols=cols), data.table(id1=1:2, key="id1"))
+test(103.71, mergelist(l, how="right", cols=cols), data.table(id1=2:3, key="id1"))
+test(103.72, mergelist(l, how="full", cols=cols), data.table(id1=1:3))
+
+## join.many argument #4383
+d = function(n) as.data.table(list(x=rep(1L, n))) ## table of n rows, every x equal to 1L, so fm/to below differ only in how many duplicates of the join value they hold
+l = list(fm=d(1), to=d(1))
+test(104.01, mergelist(l, on="x", how="left"), l$to[l$fm, on="x"])
+l = list(fm=d(2), to=d(1))
+test(104.02, mergelist(l, on="x", how="left"), l$to[l$fm, on="x"])
+test(104.03, mergelist(l, on="x", how="left", mult="error"), l$to[l$fm, on="x", mult="error"]) ## mult="error" has no effect
+l = list(fm=d(1), to=d(2))
+test(104.04, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"])
+test(104.05, mergelist(l, on="x", how="left"), error="multiple matches during merge")
+test(104.06, l$to[l$fm, on="x", mult="error"], error="multiple matches during merge")
+options(datatable.join.many=FALSE)
+test(104.07, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) ## covers !join.many && length(f__)==1L && len__==nrow(x)
+options(datatable.join.many=TRUE) ## NOTE(review): set back to a literal TRUE instead of a saved value (contrast op/options(op) used for datatable.verbose above) - presumably TRUE is the default, confirm
+l = list(fm=d(2), to=d(2))
+options(datatable.join.many=FALSE)
+test(104.08, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join")
+test(104.09, l$to[l$fm, on="x"], error="many-to-many join")
+options(datatable.join.many=TRUE)
+test(104.10, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) ## join in [ does not stop on cartesian product
+l = list(fm=d(3), to=d(1))
+test(104.11, mergelist(l, on="x", how="left"), l$to[l$fm, on="x"])
+l = list(fm=d(1), to=d(3))
+test(104.12, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) +test(104.13, mergelist(l, on="x", how="left"), error="multiple matches during merge") +test(104.14, l$to[l$fm, on="x", mult="error"], error="multiple matches during merge") +l = list(fm=d(3), to=d(2)) +options(datatable.join.many=FALSE) +test(104.15, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.16, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.17, l$to[l$fm, on="x"], error="Check for duplicate key values") +test(104.18, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x", allow.cartesian=TRUE]) +l = list(fm=d(2), to=d(3)) +options(datatable.join.many=FALSE) +test(104.19, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.20, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.21, l$to[l$fm, on="x"], error="Check for duplicate key values") +test(104.22, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x", allow.cartesian=TRUE]) +l = list(fm=d(3), to=d(3)) +options(datatable.join.many=FALSE) +test(104.23, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.24, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.25, l$to[l$fm, on="x"], error="Check for duplicate key values") +test(104.26, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x", allow.cartesian=TRUE]) +## join.many list +test(104.31, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(TRUE,FALSE)), data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2), v3=c(1L,1L,1L,1L))) +test(104.32, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(TRUE,TRUE)), 
data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2), v3=c(1L,1L,1L,1L))) +test(104.33, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(FALSE,TRUE)), error="Joining resulted in many-to-many join") +test(104.34, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(TRUE,NA)), error="must be TRUE or FALSE, or a list of such") + +## how list +test(105.01, mergelist(list(data.table(a=1:3, b=1:3), data.table(a=2L, d=1L), data.table(a=c(1:2,4L), f=1:3)), on="a", how=list("left","full")), data.table(a=1:4, b=c(1:3,NA), d=c(NA,1L,NA,NA), f=c(1:2,NA,3L))) +test(105.02, mergelist(list(data.table(a=1:3, b=1:3), data.table(a=2L, d=1L), data.table(a=c(1:2,4L), f=1:3)), on="a", how=list("left","inner")), data.table(a=1:2, b=1:2, d=c(NA,1L), f=1:2)) +Persons = data.table(PersonName=c("Alice","Bob","Charles"), key="PersonName") ## right outer join use case +Pets = data.table(PetName=c("Rover","Lassie","Fifi"), PersonName=c("Alice","Alice","Charles"), key="PetName") +PetAccessories = data.table(AccessoryName=c("Ball","Bone","Mouse"), PetName=c("Rover","Rover","Fifi"), key="AccessoryName") +expected = data.table(PetName=c("Rover","Rover",NA,"Fifi"), PersonName=c("Alice","Alice","Bob","Charles"), AccessoryName=c("Ball","Bone",NA,"Mouse")) +test(105.11, Pets[PetAccessories, on="PetName", nomatch=NULL][Persons, on="PersonName"], expected) ## test [.data.table +setcolorder(expected, "PersonName"); setkeyv(expected, "PersonName") ## ignore.row.order, ignore.col.order, check.attributes=FALSE +test(105.12, mergelist(list(Pets, PetAccessories, Persons), how=list("inner","right"), on=list("PetName","PersonName"), mult="all"), expected) + +## mult list +test(106.01, mergelist(list(data.table(a=1:2, b=1:2), data.table(a=c(1L,1L), d=1:2), data.table(a=c(1L,1L), f=1:2)), on="a", how="left", 
mult=list("last","first")), data.table(a=1:2, b=1:2, d=c(2L,NA), f=c(1L,NA))) +test(106.02, mergelist(list(data.table(a=1:2, b=1:2), data.table(a=c(1L,1L), d=1:2), data.table(a=c(1L,1L), f=1:2)), on="a", how="left", mult=list("last","error")), error="multiple matches during merge") +test(106.81, mergelist(list(data.table(a=1:2), data.table(b=1:2)), how="cross"), data.table(a=c(1L,1:2,2L), b=c(1:2,1:2))) ### mult default +test(106.82, mergelist(list(data.table(a=1:2), data.table(b=1:2), data.table(a=1:2, b=1:2)), how=list("cross","anti"), on=list(character(), c("a","b"))), data.table(a=1:2, b=2:1)) ## cool, isnt it? + +## semi join +l = list(data.table(x=c(1L,1L,1:2), y=c("a","a","a","b")), data.table(x=c(1L,1L), z=10:11)) +test(107.01, mergelist(l, how="semi", on="x", mult="first"), data.table(x=c(1L,1L,1L), y=c("a","a","a"))) +l = list(data.table(x=c(1L,3L,1:2,2L), y=c("a","c","a","b","b")), data.table(x=3:2, z=10:11)) +test(107.02, mergelist(l, how="semi", on="x", mult="first"), data.table(x=c(3:2,2L), y=c("c","b","b"))) ## rows order of x, not i +test(107.03, mergelist(list(data.table(id1=1:4, id2=4:1, v1=1L), data.table(id2=3:5, v2=2L)), on="id2", how="semi"), data.table(id1=1:2, id2=4:3, v1=1L)) ## columns order of x, not i +l = list(data.table(id=c(3L,1L,2L,1L,1L), g=c("A","A","B","B","A"), v=(1:5)*10), data.table(id=c(1L,1:3), g="A")) +test(107.11, mergelist(l, how="semi", on=c("id","g"), mult="first"), l[[1L]][ sort(unique(l[[1L]][l[[2L]], on=names(l[[2L]]), nomatch=0L, which=TRUE]))]) +test(107.81, mergelist(list(data.table(a=1:2), data.table(b=1:2), data.table(a=1:2, b=1:2)), how=list("cross","semi"), on=list(character(), c("a","b"))), data.table(a=1:2, b=1:2)) ### mult default + +## anti join +l = list(data.table(x=c(1L,1:2,2L), y=c("a","a","b","b")), data.table(x=c(1L,1L), z=10:11)) +test(108.01, mergelist(l, how="anti", on="x", mult="first"), data.table(x=c(2L,2L), y=c("b","b"))) +l = list(data.table(x=c(1L,3L,1:2,2L), y=c("a","c","a","b","b")), 
data.table(x=c(4L,1L), z=10:11)) +test(108.02, mergelist(l, how="anti", on="x", mult="first"), data.table(x=c(3:2,2L), y=c("c","b","b"))) ## rows order of x, not i +test(108.03, mergelist(list(data.table(id1=1:4, id2=4:1, v1=1L), data.table(id2=3:5, v2=2L)), on="id2", how="anti"), data.table(id1=3:4, id2=2:1, v1=1L)) ## columns order of x, not i + +## cross join +l = list(data.table(v1=1:2, v2=1:4), data.table(v3=1:3, v4=1:6)) +ans1 = mergelist(l, how="cross", mult="all") +l = list(data.table(v1=1:2, v2=1:4, k=1L), data.table(v3=1:3, v4=1:6, k=1L)) +ans2 = mergelist(l, how="inner", mult="all", on="k")[, "k":=NULL][] +ans3 = l[[2L]][l[[1L]], .(v1,v2,v3,v4), on="k", allow.cartesian=TRUE] +test(109.01, ans1, ans2) +test(109.02, ans1, ans3) +expected = data.table(v1=integer(), v2=integer(), v3=integer(), v4=integer()) +test(109.03, mergelist(list(data.table(v1=1:2, v2=1:4), data.table(v3=integer(), v4=integer())), how="cross", mult="all"), expected) +test(109.04, mergelist(list(data.table(v1=integer(), v2=integer()), data.table(v3=1:3, v4=1:6)), how="cross", mult="all"), expected) +test(109.05, mergelist(list(data.table(v1=integer(), v2=integer()), data.table(v3=integer(), v4=integer())), how="cross", mult="all"), expected) + +## retain index +l = list(data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3), data.table(id1=3:1, v2=1:3)) +setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], "id2") +ans = mergelist(l, on="id1") +test(110.01, ans, data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, v2=3:1, key="id1")) +test(110.02, copied(ans, l)) +test(110.03, hasindex(ans, "id2")) +ans = mergelist(l, on="id1", how="left", copy=FALSE) +test(110.04, ans, data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, v2=3:1, key="id1")) +test(110.05, notcopied(ans, l)) +test(110.06, hasindex(ans, "id2")) +ans = mergelist(l, on="id1", how="full") +test(110.07, ans, data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, v2=3:1, key="id1")) +test(110.08, hasindex(ans, "id2")) +l = list(data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, 
key="id1"), data.table(id1=4:1, v2=1:4)) +test(110.09, !hasindex(mergelist(l, on="id1", how="full"), "id2")) ## no index because size changes +l = list(data.table(id1=integer(), v1=integer()), data.table(id1=1:2, id2=2:1, v2=1:2)) +setkeyv(l[[2L]], "id1"); setindexv(l[[2L]], "id2") +ans = mergelist(l, on="id1", how="full") +test(110.10, ans, data.table(id1=1:2, v1=c(NA_integer_,NA), id2=2:1, v2=1:2, key="id1")) +test(110.11, hasindex(ans, "id2")) +l = list(data.table(id1=3:1, v1=1:3), data.table(id1=1:3, id2=c(2L,1L,2L), v2=1:3)) +setkeyv(l[[2L]], "id1"); setindexv(l[[2L]], "id2") +ans = mergelist(l, on="id1", how="right") +test(110.12, ans, data.table(id1=1:3, v1=3:1, id2=c(2L,1L,2L), v2=1:3, key="id1")) +test(110.13, copied(ans, l)) +test(110.14, hasindex(ans, "id2")) +ans = mergelist(l, on="id1", how="right", copy=FALSE) +test(110.15, ans, data.table(id1=1:3, v1=3:1, id2=c(2L,1L,2L), v2=1:3, key="id1")) +test(110.16, notcopied(ans, l, how="right")) +test(110.17, hasindex(ans, "id2")) + +## 3+ tables mergelist + +### 3 tables +l = list(data.table(id1=3:1, v1=1:3), data.table(id1=2:1, v2=1:2), data.table(id1=3:2, v3=1:2)) +test(111.01, mergelist(l, on="id1", how="left"), data.table(id1=3:1, v1=1:3, v2=c(NA,1:2), v3=c(1:2,NA))) +test(111.02, mergelist(l, on="id1", how="inner"), data.table(id1=2L, v1=2L, v2=1L, v3=2L)) +test(111.03, mergelist(l, on="id1", how="right"), data.table(id1=3:2, v1=c(NA,2L), v2=c(NA,1L), v3=1:2)) +test(111.04, mergelist(l, on="id1", how="full"), data.table(id1=3:1, v1=1:3, v2=c(NA,1:2), v3=c(1:2,NA))) +test(111.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(111.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## mergelist no duplicates + +### LHS equal to RHS +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=1:2, v2=1:2)) +expected = data.table(id1=1:2, v1=1:2, v2=1:2) +test(121.01, mergelist(l, on="id1", how="inner", mult="all"), expected) +test(121.02, 
mergelist(l, on="id1", how="left", mult="all"), expected) +test(121.03, mergelist(l, on="id1", how="right", mult="all"), expected) +test(121.04, mergelist(l, on="id1", how="full", mult="all"), expected) +test(121.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Single match +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=c(1L,3L), v2=1:2)) +test(121.11, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=1L, v1=1L, v2=1L)) +test(121.12, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:2, v1=1:2, v2=c(1L,NA))) +test(121.13, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=c(1L,3L), v1=c(1L,NA), v2=1:2)) +test(121.14, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=1:3, v1=c(1:2,NA), v2=c(1L,NA,2L))) +test(121.15, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.16, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Two matches +l = list(lhs = data.table(id1=1:3, v1=1:3), rhs = data.table(id1=2:4, v2=1:3)) +test(121.21, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=2:3, v1=2:3, v2=1:2)) +test(121.22, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:3, v1=1:3, v2=c(NA,1:2))) +test(121.23, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=2:4, v1=c(2:3,NA), v2=1:3)) +test(121.24, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=1:4, v1=c(1:3,NA), v2=c(NA,1:3))) +test(121.25, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.26, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Zero match +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=4:3, v2=1:2)) +test(121.31, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) 
+test(121.32, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:2, v1=1:2, v2=c(NA_integer_,NA))) +test(121.33, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=4:3, v1=c(NA_integer_,NA), v2=1:2)) +test(121.34, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=c(1:2,4:3), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2))) +test(121.35, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.36, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### LHS within RHS +l = list(lhs = data.table(id1=1:4, v1=1:4), rhs = data.table(id1=3:2, v2=1:2)) +test(121.41, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=2:3, v1=2:3, v2=2:1)) +test(121.42, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:4, v1=1:4, v2=c(NA,2:1,NA))) +test(121.43, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=3:2, v1=3:2, v2=1:2)) +test(121.44, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=1:4, v1=1:4, v2=c(NA,2:1,NA))) +test(121.45, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.46, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### RHS within LHS +l = list(lhs = data.table(id1=3:2, v1=1:2), rhs = data.table(id1=1:4, v2=1:4)) +test(121.51, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=3:2, v1=1:2, v2=3:2)) +test(121.52, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=3:2, v1=1:2, v2=3:2)) +test(121.53, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=1:4, v1=c(NA,2:1,NA), v2=1:4)) +test(121.54, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=c(3:1,4L), v1=c(1:2,NA,NA), v2=c(3:1,4L))) +test(121.55, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.56, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### RHS zero rows +l = list(lhs = data.table(id1=3:2, v1=1:2), rhs = 
data.table(id1=integer(), v2=integer())) +test(121.61, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.62, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=3:2, v1=1:2, v2=c(NA_integer_,NA))) +test(121.63, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.64, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=3:2, v1=1:2, v2=c(NA_integer_,NA))) +test(121.65, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.66, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### LHS zero rows +l = list(lhs = data.table(id1=integer(), v1=integer()), rhs = data.table(id1=2:1, v2=1:2)) +test(121.71, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.72, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.73, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)) +test(121.74, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)) +test(121.75, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.76, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### LHS and RHS zero rows +l = list(lhs = data.table(id1=integer(), v1=integer()), rhs = data.table(id1=integer(), v2=integer())) +expected = data.table(id1=integer(), v1=integer(), v2=integer()) +test(121.81, mergelist(l, on="id1", how="inner", mult="all"), expected) +test(121.82, mergelist(l, on="id1", how="left", mult="all"), expected) +test(121.83, mergelist(l, on="id1", how="right", mult="all"), expected) +test(121.84, mergelist(l, on="id1", how="full", mult="all"), expected) +test(121.85, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), 
l)) +test(121.86, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## mergelist duplicate matches, see sqlite.Rraw for tests vs SQLite db + +### duplicates in RHS and LHS matched in both sides +num = 221.00 +l = list(lhs = data.table(id1=c(1:3,3L), v1=1:4), rhs = data.table(id1=c(1L,1L,3:4), v2=1:4)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,3L,3L), v1=c(1L,1L,3L,4L), v2=c(1:3,3L)), + first = data.table(id1=c(1L,3L), v1=c(1L,3L), v2=c(1L,3L)), + last = data.table(id1=c(1L,3L), v1=c(1L,4L), v2=2:3), + error = NULL +), left = list( + all = data.table(id1=c(1L,1:3,3L), v1=c(1L,1:4), v2=c(1:2,NA,3L,3L)), + first = data.table(id1=c(1:3,3L), v1=1:4, v2=c(1L,NA,3L,3L)), + last = data.table(id1=c(1:3,3L), v1=1:4, v2=c(2L,NA,3L,3L)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,3L,3:4), v1=c(1L,1L,3:4,NA), v2=c(1:3,3:4)), + first = data.table(id1=c(1L,1L,3:4), v1=c(1L,1L,3L,NA), v2=1:4), + last = data.table(id1=c(1L,1L,3:4), v1=c(1L,1L,4L,NA), v2=1:4), + error = NULL +), full = list( + all = data.table(id1=c(1L,1:3,3:4), v1=c(1L,1:4,NA), v2=c(1:2,NA,3L,3:4)), + first = data.table(id1=1:4, v1=c(1:3,NA), v2=c(1L,NA,3:4)), + last = data.table(id1=1:4, v1=c(1:2,4L,NA), v2=c(2L,NA,3:4)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +## duplicates in RHS +num = 222.00 +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=c(2L,2:3), v2=1:3)) +expected = list(inner = list( + all = data.table(id1=c(2L,2L), v1=c(2L,2L), v2=1:2), + first = data.table(id1=2L, v1=2L, v2=1L), + last = data.table(id1=2L, v1=2L, 
v2=2L), + error = NULL +), left = list( + all = data.table(id1=c(1:2,2L), v1=c(1:2,2L), v2=c(NA,1:2)), + first = data.table(id1=1:2, v1=1:2, v2=c(NA,1L)), + last = data.table(id1=1:2, v1=1:2, v2=c(NA,2L)), + error = NULL +), right = list( + all = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3), + first = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3), + last = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3), + error = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3) +), full = list( + all = data.table(id1=c(1:2,2:3), v1=c(1:2,2L,NA), v2=c(NA,1:3)), + first = data.table(id1=c(1:2,3L), v1=c(1:2,NA), v2=c(NA,1L,3L)), + last = data.table(id1=c(1:2,3L), v1=c(1:2,NA), v2=c(NA,2:3)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### duplicates in LHS +num = 223.00 +l = list(lhs = data.table(id1=c(1:2,2L), v1=1:3), rhs = data.table(id1=2:3, v2=1:2)) +expected = list(inner = list( + all = data.table(id1=c(2L,2L), v1=2:3, v2=c(1L,1L)), + first = data.table(id1=2L, v1=2L, v2=1L), + last = data.table(id1=2L, v1=3L, v2=1L), + error = NULL +), left = list( + all = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)), + first = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)), + last = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)), + error = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)) +), right = list( + all = data.table(id1=c(2L,2:3), v1=c(2:3,NA), v2=c(1L,1:2)), + first = data.table(id1=2:3, v1=c(2L,NA), v2=1:2), + last = data.table(id1=2:3, v1=c(3L,NA), v2=1:2), + error = NULL +), full = list( + all = data.table(id1=c(1:2,2:3), v1=c(1:3,NA), v2=c(NA,1L,1:2)), + first = 
data.table(id1=1:3, v1=c(1:2,NA), v2=c(NA,1:2)), + last = data.table(id1=1:3, v1=c(1L,3L,NA), v2=c(NA,1:2)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### duplicates in RHS and LHS, some RHS dups does not have matches in LHS +num = 224.00 +l = list(lhs = data.table(id1=c(1:3,3L), v1=1:4), rhs = data.table(id1=c(1L,1L,3:4,4L), v2=1:5)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,3L,3L), v1=c(1L,1L,3L,4L), v2=c(1:3,3L)), + first = data.table(id1=c(1L,3L), v1=c(1L,3L), v2=c(1L,3L)), + last = data.table(id1=c(1L,3L), v1=c(1L,4L), v2=2:3), + error = NULL +), left = list( + all = data.table(id1=c(1L,1:3,3L), v1=c(1L,1:4), v2=c(1:2,NA,3L,3L)), + first = data.table(id1=c(1:3,3L), v1=1:4, v2=c(1L,NA,3L,3L)), + last = data.table(id1=c(1:3,3L), v1=1:4, v2=c(2L,NA,3L,3L)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,3L,3L,4L,4L), v1=c(1L,1L,3L,4L,NA,NA), v2=c(1:3,3:5)), + first = data.table(id1=c(1L,1L,3L,4L,4L), v1=c(1L,1L,3L,NA,NA), v2=1:5), + last = data.table(id1=c(1L,1L,3L,4L,4L), v1=c(1L,1L,4L,NA,NA), v2=1:5), + error = NULL +), full = list( + all = data.table(id1=c(1L,1:3,3:4,4L), v1=c(1L,1:4,NA,NA), v2=c(1:2,NA,3L,3:5)), + first = data.table(id1=1:4, v1=c(1:3,NA), v2=c(1L,NA,3:4)), + last = data.table(id1=1:4, v1=c(1:2,4L,NA), v2=c(2L,NA,3L,5L)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), 
error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### duplicates in RHS and LHS, some LHS dups does not have matches in RHS +num = 225.00 +l = list(lhs = data.table(id1=c(1L,1L,3:4,4L), v1=1:5), rhs = data.table(id1=c(1:3,3L), v2=1:4)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,3L,3L), v1=c(1:3,3L), v2=c(1L,1L,3:4)), + first = data.table(id1=c(1L,3L), v1=c(1L,3L), v2=c(1L,3L)), + last = data.table(id1=c(1L,3L), v1=2:3, v2=c(1L,4L)), + error = NULL +), left = list( + all = data.table(id1=c(1L,1L,3L,3L,4L,4L), v1=c(1:3,3:5), v2=c(1L,1L,3L,4L,NA,NA)), + first = data.table(id1=c(1L,1L,3L,4L,4L), v1=1:5, v2=c(1L,1L,3L,NA,NA)), + last = data.table(id1=c(1L,1L,3L,4L,4L), v1=1:5, v2=c(1L,1L,4L,NA,NA)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1:3,3L), v1=c(1:2,NA,3L,3L), v2=c(1L,1:4)), + first = data.table(id1=c(1:3,3L), v1=c(1L,NA,3L,3L), v2=1:4), + last = data.table(id1=c(1:3,3L), v1=c(2L,NA,3L,3L), v2=1:4), + error = NULL +), full = list( + all = data.table(id1=c(1L,1L,3L,3:4,4L,2L), v1=c(1:3,3:5,NA), v2=c(1L,1L,3:4,NA,NA,2L)), + first = data.table(id1=c(1L,3:4,2L), v1=c(1L,3:4,NA), v2=c(1L,3L,NA,2L)), + last = data.table(id1=c(1L,3:4,2L), v1=c(2:3,5L,NA), v2=c(1L,4L,NA,2L)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### cartesian match, dups on both sides of match +num = 226.00 +l = list(lhs = data.table(id1=c(1L,1:2), v1=1:3), rhs = data.table(id1=c(1L,1L,3L), v2=1:3)) +expected = list(inner = list( + all = 
data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=1L, v1=1L, v2=1L), + last = data.table(id1=1L, v1=2L, v2=2L), + error = NULL +), left = list( + all = data.table(id1=c(1L,1L,1L,1L,2L), v1=c(1L,1L,2L,2L,3L), v2=c(1:2,1:2,NA)), + first = data.table(id1=c(1L,1:2), v1=1:3, v2=c(1L,1L,NA)), + last = data.table(id1=c(1L,1:2), v1=1:3, v2=c(2L,2L,NA)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,1L,1L,3L), v1=c(1:2,1:2,NA), v2=c(1L,1:2,2:3)), + first = data.table(id1=c(1L,1L,3L), v1=c(1L,1L,NA), v2=1:3), + last = data.table(id1=c(1L,1L,3L), v1=c(2L,2L,NA), v2=1:3), + error = NULL +), full = list( + all = data.table(id1=c(1L,1L,1L,1:3), v1=c(1L,1:2,2:3,NA), v2=c(1:2,1:2,NA,3L)), + first = data.table(id1=1:3, v1=c(1L,3L,NA), v2=c(1L,NA,3L)), + last = data.table(id1=1:3, v1=c(2L,3L,NA), v2=c(2L,NA,3L)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### cross join duplicates +num = 227.00 +l = list(lhs = data.table(id1=c(1L,1L), v1=1:2), rhs = data.table(id1=c(1L,1L), v2=1:2)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=1L, v1=1L, v2=1L), + last = data.table(id1=1L, v1=2L, v2=2L), + error = NULL +), left = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=c(1L,1L), v1=1:2, v2=c(1L,1L)), + last = data.table(id1=c(1L,1L), v1=1:2, v2=c(2L,2L)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1:2,1:2), v2=c(1L,1:2,2L)), + first = data.table(id1=c(1L,1L), v1=c(1L,1L), 
v2=1:2), + last = data.table(id1=c(1L,1L), v1=c(2L,2L), v2=1:2), + error = NULL +), full = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=1L, v1=1L, v2=1L), + last = data.table(id1=1L, v1=2L, v2=2L), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} + +## NAs in join columns + +### LHS equal to RHS and having NA on +l = list(lhs = data.table(id1=c(1:2,NA), v1=1:3), rhs = data.table(id1=c(1:2,NA), v2=1:3)) +expected = data.table(id1=c(1:2,NA), v1=1:3, v2=1:3) +test(251.01, mergelist(l, on="id1", how="inner", mult="all"), expected) +test(251.02, mergelist(l, on="id1", how="left", mult="all"), expected) +test(251.03, mergelist(l, on="id1", how="right", mult="all"), expected) +test(251.04, mergelist(l, on="id1", how="full", mult="all"), expected) +test(251.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(251.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Single match and RHS having NA on +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=c(1L,NA,3L), v2=1:3)) +test(251.11, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=1L, v1=1L, v2=1L)) +test(251.12, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:2, v1=1:2, v2=c(1L,NA))) +test(251.13, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=c(1L,NA,3L), v1=c(1L,NA,NA), v2=1:3)) +test(251.14, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=c(1:2,NA,3L), v1=c(1:2,NA,NA), v2=c(1L,NA,2:3))) +test(251.15, copied(mergelist(l, on="id1", 
how="left", mult="error", copy=TRUE), l)) +test(251.16, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## exceeds overalloc for a table + +ac = getOption("datatable.alloccol") +l = list(lhs = as.data.table(c(list(id1=integer()), setNames(replicate(ac+10L,integer()), paste0("v",seq_len(ac+10L))))), rhs=data.table(id1=2:1)) +test(291.01, mergelist(l, on="id1", how="inner", mult="all"), l$lhs[0L]) +test(291.02, mergelist(l, on="id1", how="left", mult="all"), l$lhs[0L]) +test(291.03, mergelist(l, on="id1", how="right", mult="all"), l$lhs[1:2][, "id1" := 2:1][]) +test(291.04, mergelist(l, on="id1", how="full", mult="all"), rbindlist(l, use.names=TRUE, fill=TRUE)) ## test overalloc for how=="full" && !nrow(out.i) && nrow(out.r) && length(add<-setdiff(names(out.i), names(out.r))) that was failing when used set() +test(291.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(291.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## fdistinct, another round + +dt = data.table(x = +c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L, + 49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L, + 180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L, + 52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L, + 146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L, + 28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L, + 132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L, + 108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L, + 68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L, + 167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L, + 269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L, + 137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L, + 10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L, + 71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L, + 138L, 50L, 146L, 
21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L, + 114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L, + 178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L, + 200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L, + 88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L, + 265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L, + 46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L, + 174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L, + 115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L, + 30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L, + 18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L) +) +ans = unique(dt, by="x") +test(301.01, data.table(x=unique(dt$x)), ans) ## OK +test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment + +## SQLite reference tests can be launched via + +### Rscript -e inst/tests/sqlite.Rraw.manual diff --git a/man/cbindlist.Rd b/man/cbindlist.Rd new file mode 100644 index 0000000000..50ac9fbf71 --- /dev/null +++ b/man/cbindlist.Rd @@ -0,0 +1,36 @@ +\name{cbindlist} +\alias{cbindlist} +\alias{cbind} +\alias{cbind.data.table} +\title{Column bind multiple data.tables} +\description{ + Column bind multiple \code{data.table}s. +} +\usage{ + cbindlist(l, copy=TRUE) +} +\arguments{ + \item{l}{ \code{list} of \code{data.table}s to merge. } + \item{copy}{ \code{logical}, decides if columns have to be copied into the resulting object (default) or just referenced. } +} +\details{ + Column bind only stacks input elements. Works like \code{\link{data.table}} function but takes \code{list} type on input. Zero-column tables in \code{l} are omitted. Tables in \code{l} should have matching row count, recycling of rows is not yet implemented. Indices of the input tables are transferred to the resulting table, as well as a \emph{key} of the first keyed table. +} +\value{ + A new \code{data.table} based on the stacked objects.
When \code{copy} is \code{FALSE}, the resulting object will share columns with the tables in \code{l}. +} +\note{ + If output object has any duplicate names, then key and indices are removed. +} +\seealso{ + \code{\link{data.table}}, \code{\link{rbindlist}}, \code{\link{mergelist}} +} +\examples{ +l = list( + d1 = data.table(x=1:3, v1=1L), + d2 = data.table(y=3:1, v2=2L), + d3 = data.table(z=2:4, v3=3L) +) +cbindlist(l) +} +\keyword{ data } \ No newline at end of file diff --git a/src/data.table.h b/src/data.table.h index c4e76e3eb7..1fe8bd68f9 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -285,6 +285,9 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); +// mergelist.c +SEXP cbindlist(SEXP x, SEXP copyArg); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/init.c b/src/init.c index d707802820..e61c9b85e8 100644 --- a/src/init.c +++ b/src/init.c @@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = { {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)&notchin, -1}, +{"Ccbindlist", (DL_FUNC) &cbindlist, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/mergelist.c b/src/mergelist.c new file mode 100644 index 0000000000..533424bf62 --- /dev/null +++ b/src/mergelist.c @@ -0,0 +1,81 @@ +#include "data.table.h" + +void mergeIndexAttrib(SEXP to, SEXP from) { + if (!isInteger(to) || LENGTH(to)!=0) + error("'to' must be integer() already"); // # nocov + if (isNull(from)) + return; + SEXP t = ATTRIB(to), f = ATTRIB(from); + if (isNull(f)) + return; + if (isNull(t)) + SET_ATTRIB(to, shallow_duplicate(f)); + else { + for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); + SETCDR(t, shallow_duplicate(f)); + } + return; +} + +SEXP
cbindlist(SEXP x, SEXP copyArg) { + if (!isNewList(x) || INHERITS(x, char_dataframe)) + error("'x' must be a list"); + if (!IS_TRUE_OR_FALSE(copyArg)) + error("'copy' must be TRUE or FALSE"); + bool copy = (bool)LOGICAL(copyArg)[0]; + const bool verbose = GetVerbose(); + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + int nx = length(x), nans = 0, nr = -1, *nnx = (int*)R_alloc(nx, sizeof(int)); + bool recycle = false; + for (int i=0; i