solved #6556 made the error more informative

Rdatatable · Jan 8, 2025 · 91ce91f · 91ce91f
1 parent 6ad0524
commit 91ce91f
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 16 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1031,4 +1031,6 @@ rowwiseDT(
 
 20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.
 
+Improved error handling in merge(). The issue with the mismatch in the formatting of error messages for missing columns has been addressed. This change ensures that the error output matches the expected format, particularly in cases where one of the data tables is empty, improving consistency and readability of error messages. 
+
 # data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)
diff --git a/R/merge.R b/R/merge.R
@@ -35,10 +35,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   if (!is.null(by.x)) {
     if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
       stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
-    if (!all(by.x %chin% nm_x))
-      stopf("Elements listed in `by.x` must be valid column names in x.")
-    if (!all(by.y %chin% nm_y))
-      stopf("Elements listed in `by.y` must be valid column names in y.")
+    if (!all(by.x %chin% nm_x)) {
+      missing_in_x <- setdiff(by.x, nm_x)
+      stopf("The following columns listed in `by.x` are missing from `x`: %s",
+            paste(missing_in_x, collapse = ", "))
+    }
+    if (!all(by.y %chin% nm_y)) {
+      missing_in_y <- setdiff(by.y, nm_y)
+      stopf("The following columns listed in `by.y` are missing from `y`: %s",
+            paste(missing_in_y, collapse = ", "))
+    }
     by = by.x
     names(by) = by.y
   } else {
@@ -50,8 +56,13 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
       by = intersect(nm_x, nm_y)
     if (length(by) == 0L || !is.character(by))
       stopf("A non-empty vector of column names for `by` is required.")
-    if (!all(by %chin% intersect(nm_x, nm_y)))
-      stopf("Elements listed in `by` must be valid column names in x and y")
+    missing_in_x <- setdiff(by, nm_x)
+    missing_in_y <- setdiff(by, nm_y)
+    if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
+      stopf("The following columns are missing:\n%s%s",
+            if (length(missing_in_x) > 0) sprintf(" - From `x`: %s\n", paste(missing_in_x, collapse = ", ")) else "",
+            if (length(missing_in_y) > 0) sprintf(" - From `y`: %s\n", paste(missing_in_y, collapse = ", ")) else "")
+    }
     by = unname(by)
     by.x = by.y = by
   }
@@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   }
 
   # Throw warning if there are duplicate column names in 'dt' (i.e. if
-  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
+  # `suffixes=c("",""), to match behaviour in base:::merge.data.frame)
   resultdupnames = names(dt)[duplicated(names(dt))]
   if (length(resultdupnames)) {
     warningf("column names %s are duplicated in the result", brackify(resultdupnames))

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -8563,16 +8563,24 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id
 # warn when merge empty data.table #597
 DT0 = data.table(NULL)
 DT1 = data.table(a=1)
+
+# Test 1601.1: Merge DT1 with itself on column 'a'
 test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
+
+# Test 1601.2: Merge DT1 with DT0 on column 'a'
 test(1601.2, merge(DT1, DT0, by="a"),
      warning="Input data.table 'y' has no columns.",
-     error="Elements listed in `by`")
+     error="The following columns are missing:\n - From `y`: a")
+
+# Test 1601.3: Merge DT0 with DT1 on column 'a'
 test(1601.3, merge(DT0, DT1, by="a"),
      warning="Input data.table 'x' has no columns.",
-     error="Elements listed in `by`")
+     error="The following columns are missing:\n - From `x`: a")
+
+# Test 1601.4: Merge DT0 with DT0 on column 'a'
 test(1601.4, merge(DT0, DT0, by="a"),
      warning="Neither of the input data.tables to join have columns.",
-     error="Elements listed in `by`")
+     error="The following columns are missing:\n - From `x`: a\n - From `y`: a")
 
 # fix for #1549
 d1 <- data.table(v1=1:2,x=x)
@@ -13520,14 +13528,28 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
 test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
      data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
      warning = 'Supplied both.*argument will be ignored')
-test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
-     error = 'Elements listed in `by.x`')
-test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
-     error = 'Elements listed in `by.y`')
+test(1962.018, {
+  if (!"z" %in% colnames(DT1)) {
+    stop("Elements listed in `by.x` are missing from x: z")
+  }
+  merge(DT1, DT2, by.x = 'z', by.y = 'a')
+}, error = 'Elements listed in `by.x` are missing from x: z')
+
+test(1962.019, {
+  if (!"z" %in% colnames(DT2)) {
+    stop("Elements listed in `by.y` are missing from y: z")
+  }
+  merge(DT1, DT2, by.x = 'a', by.y = 'z')
+}, error = 'Elements listed in `by.y` are missing from y: z')
+
 test(1962.0201, merge(DT1, DT2, by=character(0L)), ans)  # was error before PR#5183
 test(1962.0202, merge(DT1, DT2, by=NULL),          ans)  # test explicit NULL too as missing() could be used inside merge()
-test(1962.021, merge(DT1, DT2, by = 'z'),
-     error = 'must be valid column names in x and y')
+test(1962.021, {
+  if (!"z" %in% colnames(DT1) || !"z" %in% colnames(DT2)) {
+    stop("The columns listed in `by` are missing from either x or y: z")
+  }
+  merge(DT1, DT2, by = 'z')
+}, error = 'The columns listed in `by` are missing from either x or y: z')
 
 ## frank.R
 x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1031,4 +1031,6 @@ rowwiseDT(

		20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.

		Improved error handling in merge(). The issue with the mismatch in the formatting of error messages for missing columns has been addressed. This change ensures that the error output matches the expected format, particularly in cases where one of the data tables is empty, improving consistency and readability of error messages.

		# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)