diff --git a/R/data.table.R b/R/data.table.R index e058514b0a..acbca23d58 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -520,13 +520,22 @@ replace_dot_alias = function(e) { if (!byjoin || nqbyjoin) { # Really, `anyDuplicated` in base is AWESOME! # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates - if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} - irows = if (allLen1) f__ else vecseq(f__,len__, - if (allow.cartesian || - notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). - !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) { - NULL # #742. If 'i' has no duplicates, ignore - } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). + if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} + irows = if (allLen1) f__ else { + join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility + anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) { + NULL # special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"] + } else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). + !allow.cartesian || + !join.many)) + as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_))) + limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore + if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.") + else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i)) + else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov + } + vecseq(f__, len__, limit) + } # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # Fix for #1092 and #1074 # TODO: implement better version of "any"/"all"/"which" to avoid diff --git a/R/mergelist.R b/R/mergelist.R index 9606ce0abb..435ee0a601 100644 --- a/R/mergelist.R +++ b/R/mergelist.R @@ -7,3 +7,126 @@ cbindlist = function(l, copy=TRUE) { setDT(ans) ans } + +# when 'on' is missing then use keys, used only for inner and full join +onkeys = function(x, y) { + if (is.null(x) && !is.null(y)) y + else if (!is.null(x) && is.null(y)) x + else if (!is.null(x) && !is.null(y)) { + if (length(x)>=length(y)) intersect(y, x) ## align order to shorter|rhs key + else intersect(x, y) + } else NULL # nocov ## internal error is being called later in mergepair +} +someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) { + keep = colnamesInt(x, keep) + drop = colnamesInt(x, drop) + cols = colnamesInt(x, cols) + ans = union(keep, setdiff(cols, drop)) + if (!retain.order) return(ans) + intersect(colnamesInt(x, NULL), ans) +} +hasindex = function(x, by, retGrp=FALSE) { + index = attr(x, "index", TRUE) + if (is.null(index)) return(FALSE) + idx_name = paste0("__",by,collapse="") + idx = attr(index, idx_name, TRUE) + if (is.null(idx)) return(FALSE) + if (!retGrp) return(TRUE) + return(!is.null(attr(idx, "starts", TRUE))) +} + +# fdistinct applies mult='first|last' +# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE] +# it may not copy when copy=FALSE and x is unique by 'on' +fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) { + if (!perhaps.data.table(x)) + stopf("'x' must be data.table") + if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x))) + stopf("'on' must be character column names of 'x' argument") + mult = match.arg(mult) + if (is.null(cols)) + cols = seq_along(x) + else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols)) + stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument") + if (!isTRUEorFALSE(copy)) + stopf("'%s' must be TRUE or FALSE", "copy") + ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last" + ## this short circuit will work after #4386 because it requires retGrp=T + #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE) + sort = TRUE ## above line does not work for the moment, test 302.02 + o = forderv(x, by=on, sort=sort, retGrp=TRUE) + if (attr(o, "maxgrpn", TRUE) <= 1L) { + ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE) + if (copy) ans = copy(ans) + return(ans) + } + f = attr(o, "starts", exact=TRUE) + if (mult=="last") { + if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov + f = c(f[-1L]-1L, nrow(x)) ## last of each group + } + if (length(o)) f = o[f] + if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order + .Call(CsubsetDT, x, f, someCols(x, cols, keep=on)) +} + +# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow) +# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT +dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) { + nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_) + nomatch0 = identical(nomatch, 0L) + if (is.null(mult)) + mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error") + if (void && mult!="error") + internal_error("void must be used with mult='error'") # nocov + if (how=="cross") { ## short-circuit bmerge results only for cross join + if (length(on) || mult!="all" || !join.many) + stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE") + if (void) + internal_error("cross join must be used with void=FALSE") # nocov + ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer()) + } else { + if (!length(on)) + stopf("'on' must be non-zero length character vector") + if (mult=="all" && (how=="semi" || how=="anti")) + stopf("semi and anti joins must be used with mult!='all'") + icols = colnamesInt(i, on, check_dups=TRUE) + xcols = colnamesInt(x, on, check_dups=TRUE) + ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose) + if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line + return(invisible(NULL)) + } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit + irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows + if (length(irows)==length(ans$lens)) irows = NULL + return(list(ans=ans, irows=irows)) + } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check + !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"] + anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_)) + ) + stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.") + } + + ## xrows, join-to + xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) + if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)] + len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here + + ## irows, join-from + irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens) + len.i = if (is.null(irows)) nrow(i) else length(irows) + + if (length(ans$xo) && length(xrows)) + xrows = ans$xo[xrows] + len.x = length(xrows) + + if (len.i!=len.x) + internal_error("dtmerge out len.i != len.x") # nocov + + return(list(ans=ans, irows=irows, xrows=xrows)) +} + +# Previously, we had a custom C implementation here, which is ~2x faster, +# but this is fast enough we don't bother maintaining a new routine. +# Hopefully in the future rep() can recognize the ALTREP and use that, too. +seqexp = function(x) rep(seq_along(x), x) +perhaps.data.table = function(x) .Call(CperhapsDataTableR, x) diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw index 9e6835cb71..a35c4f4103 100644 --- a/inst/tests/mergelist.Rraw +++ b/inst/tests/mergelist.Rraw @@ -6,10 +6,49 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } else { require(data.table) test = data.table:::test + perhaps.data.table = data.table:::perhaps.data.table + hasindex = data.table:::hasindex + fdistinct = data.table:::fdistinct + forderv = data.table:::forderv } addresses = function(x) vapply(x, address, "") +# internal helpers + +test(1.01, perhaps.data.table(list())) +test(1.02, perhaps.data.table(list(a=1:2))) +test(1.03, perhaps.data.table(list(a=1:2, b=1:2))) +test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE) + +test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table") +test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of") +test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of") +test(2.04, fdistinct(data.table(x=c(1L,1:2, y=1:3)), on="x", mult="last", copy=NA), error="must be TRUE or FALSE") +d = data.table(x=1:2, y=1:2) +test(2.05, ans<-fdistinct(d, on="x", mult="last"), d) +test(2.06, intersect(addresses(ans), addresses(d)), character()) +test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d) +test(2.08, addresses(ans), addresses(d)) +d = data.table(x=c(2:1,2L), y=1:3) +test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) +test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3)) +setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386 +test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) + +test(3.01, hasindex(d, "x")) +test(3.02, hasindex(d, "x", retGrp=TRUE)) +setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386 +test(3.03, hasindex(d, "x")) +test(3.04, !hasindex(d, "x", retGrp=TRUE)) +setattr(d, "index", NULL) +test(3.05, !hasindex(d, "x")) +test(3.06, !hasindex(d, "x", retGrp=TRUE)) +setattr(d, "index", integer()) +test(3.07, !hasindex(d, "x")) +test(3.08, !hasindex(d, "x", retGrp=TRUE)) +rm(d) + # cbindlist l = list( @@ -57,7 +96,7 @@ attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")] test(13.01, class(d), "list") setDT(d) test(13.02, key(d), "x") -# test(13.03, hasindex(d, "y") && hasindex(d, "z")) +test(13.03, hasindex(d, "y") && hasindex(d, "z")) l = list( data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5), data.table(id4=5:1, id5=1:5, v2=1:5), @@ -70,3 +109,36 @@ ans = cbindlist(l) test(13.04, key(ans), "id1") test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9")) test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib + +## fdistinct, another round + +dt = data.table(x = +c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L, + 49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L, + 180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L, + 52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L, + 146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L, + 28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L, + 132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L, + 108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L, + 68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L, + 167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L, + 269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L, + 137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L, + 10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L, + 71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L, + 138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L, + 114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L, + 178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L, + 200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L, + 88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L, + 265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L, + 46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L, + 174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L, + 115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L, + 30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L, + 18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L) +) +ans = unique(dt, by="x") +test(301.01, data.table(x=unique(dt$x)), ans) ## OK +test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment diff --git a/src/init.c b/src/init.c index 7189bb9da6..1ee4243145 100644 --- a/src/init.c +++ b/src/init.c @@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = { {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, {"Ccbindlist", (DL_FUNC) &cbindlist, -1}, +{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, {NULL, NULL, 0} };