From 24c0d3ade2fbed41350d9bcd2136dbbdb63f55a3 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <balaji@anyscale.com>
Date: Wed, 27 Nov 2024 11:45:17 -0600
Subject: [PATCH] [Data] Clarify schema validation error (#48882)

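Make the error raised when a sort or group-by key column is missing easier to
read: list the dataset's column names instead of embedding the full schema
printout in the message. For example:

```python
ray.data.range(1).groupby("does_not_exist").count().materialize()
```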

**Before**
```
ValueError: The column 'does_not_exist' does not exist in the schema 'Column  Type
------  ----
id      int64'.
```

**After**
```
ValueError: You specified the column 'does_not_exist', but there's no such column in the dataset. The dataset has columns: {'id'}
```

---------

Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu>
Signed-off-by: hjiang <dentinyhao@gmail.com>
---
 .../_internal/planner/exchange/sort_task_spec.py  |  5 +++--
 python/ray/data/tests/test_execution_optimizer.py | 15 ++++-----------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py
index 827c4a2c7a51d..7c67b3dbdefe0 100644
--- a/python/ray/data/_internal/planner/exchange/sort_task_spec.py
+++ b/python/ray/data/_internal/planner/exchange/sort_task_spec.py
@@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]):
             for column in self._columns:
                 if column not in schema_names_set:
                     raise ValueError(
-                        "The column '{}' does not exist in the "
-                        "schema '{}'.".format(column, schema)
+                        f"You specified the column '{column}', but there's no such "
+                        "column in the dataset. The dataset has columns: "
+                        f"{schema_names_set}"
                     )
 
     @property
diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py
index d657ce1c9d982..af7af855b1871 100644
--- a/python/ray/data/tests/test_execution_optimizer.py
+++ b/python/ray/data/tests/test_execution_optimizer.py
@@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
     assert extract_values("id", ds.sort("id").take_all()) == list(range(10))
 
     invalid_col_name = "invalid_column"
-    with pytest.raises(
-        ValueError, match=f"The column '{invalid_col_name}' does not exist"
-    ):
+    with pytest.raises(ValueError, match="there's no such column in the dataset"):
         ds.sort(invalid_col_name).take_all()
 
     ds_named = ray.data.from_items(
@@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
     assert [d["col1"] for d in r1] == [7, 5, 3, 1]
     assert [d["col2"] for d in r2] == [8, 6, 4, 2]
 
-    with pytest.raises(
-        ValueError,
-        match=f"The column '{invalid_col_name}' does not exist in the schema",
-    ):
+    with pytest.raises(ValueError, match="there's no such column in the dataset"):
         ds_named.sort(invalid_col_name).take_all()
 
 
@@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle):
 def test_aggregate_validate_keys(ray_start_regular_shared):
     ds = ray.data.range(10)
     invalid_col_name = "invalid_column"
-    with pytest.raises(
-        ValueError, match=f"The column '{invalid_col_name}' does not exist"
-    ):
+    with pytest.raises(ValueError):
         ds.groupby(invalid_col_name).count()
 
     ds_named = ray.data.from_items(
@@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared):
 
     with pytest.raises(
         ValueError,
-        match=f"The column '{invalid_col_name}' does not exist in the schema",
+        match="there's no such column in the dataset",
     ):
         ds_named.groupby(invalid_col_name).count()