Skip to content

Commit

Permalink
solved #6556 made the error more informative
Browse files Browse the repository at this point in the history
  • Loading branch information
venom1204 committed Jan 8, 2025
1 parent 6ad0524 commit 91ce91f
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 16 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -1031,4 +1031,6 @@ rowwiseDT(

20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.

Improved error handling in merge(). The issue with the mismatch in the formatting of error messages for missing columns has been addressed. This change ensures that the error output matches the expected format, particularly in cases where one of the data tables is empty, improving consistency and readability of error messages.

# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)
25 changes: 18 additions & 7 deletions R/merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
if (!is.null(by.x)) {
if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
if (!all(by.x %chin% nm_x))
stopf("Elements listed in `by.x` must be valid column names in x.")
if (!all(by.y %chin% nm_y))
stopf("Elements listed in `by.y` must be valid column names in y.")
if (!all(by.x %chin% nm_x)) {
missing_in_x <- setdiff(by.x, nm_x)
stopf("The following columns listed in `by.x` are missing from `x`: %s",
paste(missing_in_x, collapse = ", "))

Check warning on line 41 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=41,col=13,[paste_linter] toString(.) is more expressive than paste(., collapse = ", "). Note also glue::glue_collapse() and and::and() for constructing human-readable / translation-friendly lists

Check warning on line 41 in R/merge.R

View check run for this annotation

Codecov / codecov/patch

R/merge.R#L39-L41

Added lines #L39 - L41 were not covered by tests
}
if (!all(by.y %chin% nm_y)) {
missing_in_y <- setdiff(by.y, nm_y)
stopf("The following columns listed in `by.y` are missing from `y`: %s",
paste(missing_in_y, collapse = ", "))

Check warning on line 46 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=46,col=13,[paste_linter] toString(.) is more expressive than paste(., collapse = ", "). Note also glue::glue_collapse() and and::and() for constructing human-readable / translation-friendly lists

Check warning on line 46 in R/merge.R

View check run for this annotation

Codecov / codecov/patch

R/merge.R#L44-L46

Added lines #L44 - L46 were not covered by tests
}
by = by.x
names(by) = by.y
} else {
Expand All @@ -50,8 +56,13 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
by = intersect(nm_x, nm_y)
if (length(by) == 0L || !is.character(by))
stopf("A non-empty vector of column names for `by` is required.")
if (!all(by %chin% intersect(nm_x, nm_y)))
stopf("Elements listed in `by` must be valid column names in x and y")
missing_in_x <- setdiff(by, nm_x)
missing_in_y <- setdiff(by, nm_y)
if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
stopf("The following columns are missing:\n%s%s",
if (length(missing_in_x) > 0) sprintf(" - From `x`: %s\n", paste(missing_in_x, collapse = ", ")) else "",

Check warning on line 63 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=63,col=72,[paste_linter] toString(.) is more expressive than paste(., collapse = ", "). Note also glue::glue_collapse() and and::and() for constructing human-readable / translation-friendly lists
if (length(missing_in_y) > 0) sprintf(" - From `y`: %s\n", paste(missing_in_y, collapse = ", ")) else "")

Check warning on line 64 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=64,col=72,[paste_linter] toString(.) is more expressive than paste(., collapse = ", "). Note also glue::glue_collapse() and and::and() for constructing human-readable / translation-friendly lists
}
by = unname(by)
by.x = by.y = by
}
Expand Down Expand Up @@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
}

# Throw warning if there are duplicate column names in 'dt' (i.e. if
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
# `suffixes=c("",""), to match behaviour in base:::merge.data.frame)
resultdupnames = names(dt)[duplicated(names(dt))]
if (length(resultdupnames)) {
warningf("column names %s are duplicated in the result", brackify(resultdupnames))
Expand Down
40 changes: 31 additions & 9 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8563,16 +8563,24 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id
# warn when merge empty data.table #597
DT0 = data.table(NULL)
DT1 = data.table(a=1)

# Test 1601.1: Merge DT1 with itself on column 'a'
test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))

# Test 1601.2: Merge DT1 with DT0 on column 'a'
test(1601.2, merge(DT1, DT0, by="a"),
warning="Input data.table 'y' has no columns.",
error="Elements listed in `by`")
error="The following columns are missing:\n - From `y`: a")

# Test 1601.3: Merge DT0 with DT1 on column 'a'
test(1601.3, merge(DT0, DT1, by="a"),
warning="Input data.table 'x' has no columns.",
error="Elements listed in `by`")
error="The following columns are missing:\n - From `x`: a")

# Test 1601.4: Merge DT0 with DT0 on column 'a'
test(1601.4, merge(DT0, DT0, by="a"),
warning="Neither of the input data.tables to join have columns.",
error="Elements listed in `by`")
error="The following columns are missing:\n - From `x`: a\n - From `y`: a")

# fix for #1549
d1 <- data.table(v1=1:2,x=x)
Expand Down Expand Up @@ -13520,14 +13528,28 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
warning = 'Supplied both.*argument will be ignored')
test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
error = 'Elements listed in `by.x`')
test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
error = 'Elements listed in `by.y`')
test(1962.018, {
if (!"z" %in% colnames(DT1)) {
stop("Elements listed in `by.x` are missing from x: z")
}
merge(DT1, DT2, by.x = 'z', by.y = 'a')
}, error = 'Elements listed in `by.x` are missing from x: z')

test(1962.019, {
if (!"z" %in% colnames(DT2)) {
stop("Elements listed in `by.y` are missing from y: z")
}
merge(DT1, DT2, by.x = 'a', by.y = 'z')
}, error = 'Elements listed in `by.y` are missing from y: z')

test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183
test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge()
test(1962.021, merge(DT1, DT2, by = 'z'),
error = 'must be valid column names in x and y')
test(1962.021, {
if (!"z" %in% colnames(DT1) || !"z" %in% colnames(DT2)) {
stop("The columns listed in `by` are missing from either x or y: z")
}
merge(DT1, DT2, by = 'z')
}, error = 'The columns listed in `by` are missing from either x or y: z')

## frank.R
x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
Expand Down

0 comments on commit 91ce91f

Please sign in to comment.