Skip to content

Commit

Permalink
rbindlist support fill=TRUE with use.names=FALSE and use it in merge.…
Browse files Browse the repository at this point in the history
…R ToDo of #678 (#5263)
  • Loading branch information
ben-schwen authored Nov 23, 2021
1 parent d8dc315 commit 4922384
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 15 deletions.
45 changes: 45 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,51 @@
# v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040 100
# v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100
```

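31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE`: items are bound by column position, and items with fewer columns are filled with `NA` in the missing trailing columns. Previously this combination issued the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.` and bound by name instead.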

```R
DT1
# A B
# <int> <int>
# 1: 1 5
# 2: 2 6

DT2
# foo
# <int>
# 1: 3
# 2: 4

rbind(DT1, DT2, fill=TRUE) # no change
# A B foo
# <int> <int> <int>
# 1: 1 5 NA
# 2: 2 6 NA
# 3: NA NA 3
# 4: NA NA 4

rbind(DT1, DT2, fill=TRUE, use.names=FALSE)

# was:
# A B foo
# <int> <int> <int>
# 1: 1 5 NA
# 2: 2 6 NA
# 3: NA NA 3
# 4: NA NA 4
# Warning message:
# In rbindlist(l, use.names, fill, idcol) :
# use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.

# now:
# A B
# <int> <int>
# 1: 1 5
# 2: 2 6
# 3: 3 NA
# 4: 4 NA
```

## BUG FIXES

Expand Down
11 changes: 1 addition & 10 deletions R/merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
# Perhaps not very commonly used, so not a huge deal that the join is redone here.
missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
if (length(missingyidx)) {
yy = y[missingyidx]
othercolsx = setdiff(nm_x, by)
if (length(othercolsx)) {
tmp = rep.int(NA_integer_, length(missingyidx))
# TO DO: use set() here instead..
yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
}
# empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
# takes care of #24 without having to save names. This is how it should be, IMHO.
dt = rbind(dt, yy, use.names=FALSE)
dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE)
}
}
# X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
Expand Down
19 changes: 17 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -1863,6 +1863,8 @@ test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list
# Test merge with common names and all.y=TRUE, #2011
DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a")
DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a")
DT3 = data.table(a=c(2), total=c(5), key="a")
DT4 = data.table(a=c(3), total=c(1), key="a")
# 629+630 worked before anyway. 631+632 test the bug fix.
adf=as.data.frame
adt=as.data.table
Expand All @@ -1875,6 +1877,16 @@ test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a"

test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a"))
test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a))
# ensure merge(x,y,all.y) does not alter input y ...
# .. i subset y with 1:nrow(y)
test(631.2, merge(DT1[c(1,3)],DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=NA_real_,total.y=c(5,1,2),key="a"))
test(631.3, DT2, data.table(a=c(2,3,5), total=c(5,1,2), key="a"))
# .. nrow(y)=1, i subset y with 1 and no match with x
test(631.4, merge(DT1,DT3,all.y=TRUE), data.table(a=c(2),total.x=NA_real_,total.y=c(5),key="a"))
test(631.5, DT3, data.table(a=c(2), total=c(5), key="a"))
# .. nrow(y)=1, i subset y with 1 and match with x
test(631.6, merge(DT1,DT4,all.y=TRUE), data.table(a=c(3),total.x=c(1),total.y=c(1),key="a"))
test(631.7, DT4, data.table(a=c(3), total=c(1), key="a"))

test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a"))
test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a))
Expand Down Expand Up @@ -14577,8 +14589,11 @@ test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logica
test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]")
test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE")
test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE),
data.table(a=c(1:2,NA,NA), b=c(NA,NA,3:4)),
warning="use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE")
data.table(a=c(1:4)))
test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE),
data.table(a=c(1:4), c=INT(5,6,NA,NA)))
test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE),
data.table(a=c(1:4), V1=INT(NA,NA,5,6)))

# chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111
x1 = "fa\xE7ile"
Expand Down
2 changes: 1 addition & 1 deletion man/rbindlist.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL)
\arguments{
\item{l}{ A list containing \code{data.table}, \code{data.frame} or \code{list} objects. \code{\dots} is the same but you pass the objects by name separately. }
\item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. \code{"check"} (default) warns if the items do not all have the same names in the same order and then currently proceeds as if \code{use.names=FALSE} for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.}
\item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.}
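\item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}. When \code{TRUE} and \code{use.names} is left at its default, columns are matched by name as if \code{use.names=TRUE}; with \code{use.names=FALSE}, items are bound by position and the missing trailing columns are filled with NAs.}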
\item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.}
}
\details{
Expand Down
3 changes: 1 addition & 2 deletions src/rbindlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists."));
Rboolean usenames = LOGICAL(usenamesArg)[0];
const bool fill = LOGICAL(fillArg)[0];
if (fill && usenames!=TRUE) {
if (usenames==FALSE) warning(_("use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.")); // else no warning if usenames==NA (default)
if (fill && usenames==NA_LOGICAL) {
usenames=TRUE;
}
const bool idcol = !isNull(idcolArg);
Expand Down

0 comments on commit 4922384

Please sign in to comment.