From c31557db1442724a8fc4f84d5610fe91af74519f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 26 Dec 2023 22:44:30 +0100 Subject: [PATCH 1/4] add regression fix --- NEWS.md | 2 +- R/merge.R | 13 ++++++++++++- inst/tests/tests.Rraw | 5 +++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4ca025d22..0a953262f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -204,7 +204,7 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` -30. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`, [#5444](https://github.com/Rdatatable/data.table/issues/5444). Thanks to @sindribaldur for testing dev and filing a bug report which was fixed before release. +30. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`, [#5444](https://github.com/Rdatatable/data.table/issues/5444). Thanks to @sindribaldur, @dcaseykc, @fox34, @adrian-quintario and @berg-michael for testing dev and filing a bug report which was fixed before release. ```R DT1 diff --git a/R/merge.R b/R/merge.R index cbc9b9e29..a578c26e1 100644 --- a/R/merge.R +++ b/R/merge.R @@ -96,8 +96,19 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] + # TO DO: replace by following once #5446 is merged + # if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) if (length(missingyidx)) { - dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE) + yy = y[missingyidx] + othercolsx = setdiff(nm_x, by) + if (length(othercolsx)) { + tmp = rep.int(NA_integer_, length(missingyidx)) + # TO DO: use set() here instead.. + yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) + } + # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist + # takes care of #24 without having to save names. This is how it should be, IMHO. + dt = rbind(dt, yy, use.names=FALSE) } } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2e49bb5f6..174af7120 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1921,6 +1921,11 @@ test(631.5, DT3, data.table(a=c(2), total=c(5), key="a")) # .. nrow(y)=1, i subset y with 1 and match with x test(631.6, merge(DT1,DT4,all.y=TRUE), data.table(a=c(3),total.x=c(1),total.y=c(1),key="a")) test(631.7, DT4, data.table(a=c(3), total=c(1), key="a")) +# merge columns with different attributes #5309 +x = data.table(a=1L, b=as.IDate(16801)) +y = data.table(a=2L, b=NA) +test(631.8, merge(x,y,by="a",all=TRUE), data.table(a=c(1L,2L), b.x=as.IDate(c(16801,NA)), b.y=NA, key="a")) +test(631.9, merge(y,x,by="a",all=TRUE), data.table(a=c(1L,2L), b.x=NA, b.y=as.IDate(c(16801,NA)), key="a")) test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a")) test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) From 4838e6b05a09ee51f5ec349e66d86248676f1bae Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 27 Dec 2023 12:27:32 +0100 Subject: [PATCH 2/4] add tests from #5309 --- inst/tests/tests.Rraw | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 174af7120..52ee2f14e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14347,6 +14347,14 @@ test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=T # rbindlist segfault with fill=TRUE and usenames=FALSE #5444 test(2003.6, rbindlist(list(list(1), list(2,3)), fill=TRUE, use.names=FALSE), data.table(c(1,2), c(NA, 3))) test(2003.7, rbindlist(list(list(1), list(2,factor(3))), fill=TRUE, use.names=FALSE), data.table(c(1,2), factor(c(NA, 3)))) +# rbind with different attributes #5309 +x=data.table(a=as.Date(NA)) +y=data.table(a=as.Date('2021-10-05'), b=as.POSIXct("2021-10-06 13:58:00 UTC")) +ans=data.table(a=as.Date(c(NA_character_, '2021-10-05')), b=as.POSIXct(c(NA_character_, "2021-10-06 13:58:00 UTC"))) +test(2003.81, rbind(x, y, fill=TRUE, use.names=TRUE), ans) +test(2003.82, rbind(y, x, fill=TRUE, use.names=TRUE), ans[2:1,]) +test(2003.83, rbind(x, y, fill=TRUE, use.names=FALSE), ans) +test(2003.84, rbind(y, x, fill=TRUE, use.names=FALSE), ans[2:1,]) # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" From e410cd0cecc50c8cccc832041a480dccea105460 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 27 Dec 2023 13:19:49 +0100 Subject: [PATCH 3/4] added comment about NA rectangle --- R/merge.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/merge.R b/R/merge.R index a578c26e1..7e98c048a 100644 --- a/R/merge.R +++ b/R/merge.R @@ -102,6 +102,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL yy = y[missingyidx] othercolsx = setdiff(nm_x, by) if (length(othercolsx)) { + # create NA rectangle with correct types of x to cbind to y tmp = rep.int(NA_integer_, length(missingyidx)) # TO DO: use set() here instead.. yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) From 123ac11cd7dcc16094218cf16479f3ddf198b50b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 27 Dec 2023 06:35:57 -0800 Subject: [PATCH 4/4] emphasize subtle part about attributes too --- R/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index 7e98c048a..8062d91fc 100644 --- a/R/merge.R +++ b/R/merge.R @@ -102,7 +102,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL yy = y[missingyidx] othercolsx = setdiff(nm_x, by) if (length(othercolsx)) { - # create NA rectangle with correct types of x to cbind to y + # create NA rectangle with correct types and attributes of x to cbind to y tmp = rep.int(NA_integer_, length(missingyidx)) # TO DO: use set() here instead.. yy = cbind(yy, x[tmp, othercolsx, with = FALSE])