ray-project · bveeramani · Nov 27, 2024 · Nov 22, 2024 · Nov 25, 2024 · Nov 25, 2024
@@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]):
             for column in self._columns:
                 if column not in schema_names_set:
                     raise ValueError(
-                        "The column '{}' does not exist in the "
-                        "schema '{}'.".format(column, schema)
+                        f"You specified the column '{column}', but there's no such "
+                        "column in the dataset. The dataset has columns: "
+                        f"{schema_names_set}"
                     )
 
     @property

@@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
     assert extract_values("id", ds.sort("id").take_all()) == list(range(10))
 
     invalid_col_name = "invalid_column"
-    with pytest.raises(
-        ValueError, match=f"The column '{invalid_col_name}' does not exist"
-    ):
+    with pytest.raises(ValueError, match="there's no such column in the dataset"):
         ds.sort(invalid_col_name).take_all()
 
     ds_named = ray.data.from_items(
@@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
     assert [d["col1"] for d in r1] == [7, 5, 3, 1]
     assert [d["col2"] for d in r2] == [8, 6, 4, 2]
 
-    with pytest.raises(
-        ValueError,
-        match=f"The column '{invalid_col_name}' does not exist in the schema",
-    ):
+    with pytest.raises(ValueError, match="there's no such column in the dataset"):
         ds_named.sort(invalid_col_name).take_all()
 
 
@@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle):
 def test_aggregate_validate_keys(ray_start_regular_shared):
     ds = ray.data.range(10)
     invalid_col_name = "invalid_column"
-    with pytest.raises(
-        ValueError, match=f"The column '{invalid_col_name}' does not exist"
-    ):
+    with pytest.raises(ValueError):
         ds.groupby(invalid_col_name).count()
 
     ds_named = ray.data.from_items(
@@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared):
 
     with pytest.raises(
         ValueError,
-        match=f"The column '{invalid_col_name}' does not exist in the schema",
+        match="there's no such column in the dataset",
     ):
         ds_named.groupby(invalid_col_name).count()