From 24c0d3ade2fbed41350d9bcd2136dbbdb63f55a3 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 27 Nov 2024 11:45:17 -0600 Subject: [PATCH] [Data] Clarify schema validation error (#48882) ```python ray.data.range(1).groupby("does_not_exist").count().materialize() ``` **Before** ``` ValueError: The column 'does_not_exist' does not exist in the schema 'Column Type ------ ---- id int64'. ``` **After** ``` ValueError: You specified the column 'does_not_exist', but there's no such column in the dataset. The dataset has columns: {'id'} ``` --------- Signed-off-by: Balaji Veeramani Signed-off-by: hjiang --- .../_internal/planner/exchange/sort_task_spec.py | 5 +++-- python/ray/data/tests/test_execution_optimizer.py | 15 ++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py index 827c4a2c7a51d..7c67b3dbdefe0 100644 --- a/python/ray/data/_internal/planner/exchange/sort_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]): for column in self._columns: if column not in schema_names_set: raise ValueError( - "The column '{}' does not exist in the " - "schema '{}'.".format(column, schema) + f"You specified the column '{column}', but there's no such " + "column in the dataset. The dataset has columns: " + f"{schema_names_set}" ) @property diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py index d657ce1c9d982..af7af855b1871 100644 --- a/python/ray/data/tests/test_execution_optimizer.py +++ b/python/ray/data/tests/test_execution_optimizer.py @@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared): assert extract_values("id", ds.sort("id").take_all()) == list(range(10)) invalid_col_name = "invalid_column" - with pytest.raises( - ValueError, match=f"The column '{invalid_col_name}' does not exist" - ): + with pytest.raises(ValueError, match="there's no such column in the dataset"): ds.sort(invalid_col_name).take_all() ds_named = ray.data.from_items( @@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared): assert [d["col1"] for d in r1] == [7, 5, 3, 1] assert [d["col2"] for d in r2] == [8, 6, 4, 2] - with pytest.raises( - ValueError, - match=f"The column '{invalid_col_name}' does not exist in the schema", - ): + with pytest.raises(ValueError, match="there's no such column in the dataset"): ds_named.sort(invalid_col_name).take_all() @@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle): def test_aggregate_validate_keys(ray_start_regular_shared): ds = ray.data.range(10) invalid_col_name = "invalid_column" - with pytest.raises( - ValueError, match=f"The column '{invalid_col_name}' does not exist" - ): + with pytest.raises(ValueError): ds.groupby(invalid_col_name).count() ds_named = ray.data.from_items( @@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared): with pytest.raises( ValueError, - match=f"The column '{invalid_col_name}' does not exist in the schema", + match="there's no such column in the dataset", ): ds_named.groupby(invalid_col_name).count()