Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement catalog filter for KedroDataCatalog #4449

Merged
merged 48 commits into from
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
c328842
Fixed catalog list for KedroDataCatalog
ElenaKhaustova Jan 22, 2025
0e4080b
Replaced solution
ElenaKhaustova Jan 22, 2025
bf09541
Updated solution and made it on the catalog side
ElenaKhaustova Jan 22, 2025
2ec76e9
Updated internal datasets access for KedroDataCatalog
ElenaKhaustova Jan 22, 2025
43cafca
Fixed __getattribute__
ElenaKhaustova Jan 22, 2025
4972a0c
Added test template
ElenaKhaustova Jan 23, 2025
424eea6
Updated solution and test
ElenaKhaustova Jan 23, 2025
bd36f24
Fixed linter
ElenaKhaustova Jan 23, 2025
b48a274
Merge branch 'main' into fix/4436-catalog-list
ElenaKhaustova Jan 23, 2025
93cbbb1
Updated release notes
ElenaKhaustova Jan 23, 2025
b02f5c0
Merge branch 'main' into fix/4436-catalog-list
ElenaKhaustova Jan 27, 2025
329a56c
Implemented a draft of filtering method
ElenaKhaustova Jan 27, 2025
9839bd8
Updated filter
ElenaKhaustova Jan 27, 2025
1fc083f
Fixed lint
ElenaKhaustova Jan 27, 2025
6fcd7f4
Updated old list method
ElenaKhaustova Jan 27, 2025
13a0b2b
Implemented tests for new filter
ElenaKhaustova Jan 27, 2025
ede073f
Merge branch 'main' into feature/3917-refactor-catalog-filter
ElenaKhaustova Jan 28, 2025
9aa62f7
Added tests for lazy datasets
ElenaKhaustova Jan 28, 2025
f4befbf
Added docstrings and usage examples
ElenaKhaustova Jan 28, 2025
392309f
Updated examples in the docstrings
ElenaKhaustova Jan 28, 2025
f80ebaa
Updated lazy dataset representation
ElenaKhaustova Jan 28, 2025
156a0d3
Updated unit tests
ElenaKhaustova Jan 28, 2025
cdd4c8b
Updated tests to reach coverage
ElenaKhaustova Jan 28, 2025
9ec854d
Updated release notes
ElenaKhaustova Jan 28, 2025
cbe02a3
Merge branch 'update-lazy-dataset-repr' into feature/3917-refactor-catalog-filter
ElenaKhaustova Jan 28, 2025
80da4e8
Updated _LazyDataset representation
ElenaKhaustova Jan 28, 2025
b26302c
Updated release notes
ElenaKhaustova Jan 28, 2025
f07f0a2
Added default value to the docstrings
ElenaKhaustova Jan 29, 2025
3884abe
Renamed _compile_pattern to _compile_regex_pattern
ElenaKhaustova Jan 29, 2025
70f61a6
Merge branch 'main' into update-lazy-dataset-repr
ElenaKhaustova Jan 29, 2025
d9a0f1a
Updated release notes
ElenaKhaustova Jan 29, 2025
03e303d
Merge branch 'update-lazy-dataset-repr' into feature/3917-refactor-catalog-filter
ElenaKhaustova Jan 29, 2025
690f105
Updated release notes
ElenaKhaustova Jan 29, 2025
cbeec99
Updated secrets baseline
ElenaKhaustova Jan 29, 2025
2c9891d
Merge branch 'main' into feature/3917-refactor-catalog-filter
ElenaKhaustova Jan 29, 2025
3c0f1e7
Merge branch 'main' into feature/3917-refactor-catalog-filter
ElenaKhaustova Feb 5, 2025
b94d6e1
Added by_type filter
ElenaKhaustova Feb 5, 2025
5fe2a9e
Fixed bugs found when testing
ElenaKhaustova Feb 5, 2025
7c3d558
Updated tests
ElenaKhaustova Feb 5, 2025
261e82f
Fixed linter
ElenaKhaustova Feb 5, 2025
c942e33
Updated docstring
ElenaKhaustova Feb 6, 2025
ecea26a
Updated release notes
ElenaKhaustova Feb 6, 2025
7a7e222
Merge branch 'main' into feature/3917-refactor-catalog-filter
ElenaKhaustova Feb 6, 2025
2a33f29
Updated function to accept compiled patterns
ElenaKhaustova Feb 6, 2025
e59b7d1
Updated unit tests
ElenaKhaustova Feb 6, 2025
ad90d9e
Removed bad regex test
ElenaKhaustova Feb 6, 2025
b512eb6
Fixed linter
ElenaKhaustova Feb 6, 2025
3e65577
Merge branch 'main' into feature/3917-refactor-catalog-filter
ElenaKhaustova Feb 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,9 @@
"filename": "tests/io/test_kedro_data_catalog.py",
"hashed_secret": "15dd2c9ccec914f1470b4dccb45789844e49cf70",
"is_verified": false,
"line_number": 499
"line_number": 558
}
]
},
"generated_at": "2025-01-27T18:47:13Z"
"generated_at": "2025-01-28T15:23:17Z"
}
2 changes: 2 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Improve OmegaConfigLoader performance.
* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base.
* Added support for `%load_ext kedro`.
* Implemented `KedroDataCatalog.filter()` to filter datasets by name and type.

## Bug fixes and other changes
* Added validation to ensure dataset versions consistency across catalog.
Expand All @@ -18,6 +19,7 @@
* Updated `Partitioned dataset lazy saving` docs page.
* Fixed `KedroDataCatalog` mutation after pipeline run.
* Made `KedroDataCatalog._datasets` compatible with `DataCatalog._datasets`.
* Updated `_LazyDataset` representation when printing `KedroDataCatalog`.

## Breaking changes to the API
## Documentation changes
Expand Down
95 changes: 86 additions & 9 deletions kedro/io/kedro_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
Version,
_validate_versions,
generate_timestamp,
parse_dataset_definition,
)
from kedro.io.memory_dataset import MemoryDataset, _is_memory_dataset
from kedro.utils import _format_rich, _has_rich_handler
Expand All @@ -48,7 +49,8 @@ def __init__(
self.save_version = save_version

def __repr__(self) -> str:
return f"{self.config.get('type', 'UnknownType')}"
class_type, _ = parse_dataset_definition(self.config)
return f"{class_type.__module__}.{class_type.__qualname__}"

def materialize(self) -> AbstractDataset:
return AbstractDataset.from_config(
Expand Down Expand Up @@ -549,11 +551,82 @@ def add(
)
self.__setitem__(ds_name, dataset)

def filter(
self,
name_regex: str | None = None,
name_regex_flags: int | re.RegexFlag = re.IGNORECASE,
type_regex: str | None = None,
ElenaKhaustova marked this conversation as resolved.
Show resolved Hide resolved
type_regex_flags: int | re.RegexFlag = 0,
) -> List[str]: # noqa: UP006
"""Filter dataset names registered in the catalog based on name and/or type.

This method allows filtering datasets by their names and/or types using optional
regular expression patterns. Each pattern can also support optional regex flags
for customization. If no filters are provided, all dataset names are returned.

Args:
name_regex: Optional regular expression to filter dataset names by name.
name_regex_flags: Optional regex flags for the name filter.
By default, IGNORECASE key is set.
type_regex: Optional regular expression to filter dataset names by their type.
The provided regex is matched against the full dataset type path, for example:
`kedro_datasets.pandas.parquet_dataset.ParquetDataset`.
type_regex_flags: Optional regex flags for the type filter.

Returns:
A list of dataset names that match the filtering criteria based on `name_regex`
and/or `type_regex`. If no filters are provided, all dataset names are returned.

Raises:
SyntaxError: If the provided regex patterns are invalid.

Example:
::

>>> catalog = KedroDataCatalog()
>>> # get datasets where the substring 'raw' is present
>>> raw_data = catalog.filter(name_regex='raw')
>>> # get datasets of a specific type
>>> csv_datasets = catalog.filter(type_regex='pandas.excel_dataset.ExcelDataset')
>>> # get datasets where names start with 'model_' and are of a specific type
>>> model_datasets = catalog.filter(
... name_regex='^model_',
... type_regex='ModelDataset',
... )
"""

# Apply name filter if specified
if name_regex:
pattern = _compile_pattern(name_regex, name_regex_flags)
filtered_names = [
ds_name for ds_name in self.__iter__() if pattern.search(ds_name)
]
else:
filtered_names = self.keys()

# Apply type filter if specified
if type_regex:
pattern = _compile_pattern(type_regex, type_regex_flags)
filtered_types = []
for ds_name in filtered_names:
# Retrieve the dataset type
if ds_name in self._lazy_datasets:
str_type = str(self._lazy_datasets[ds_name])
else:
class_type = type(self.__datasets[ds_name])
str_type = f"{class_type.__module__}.{class_type.__qualname__}"
# Match the dataset type against the type_regex
if pattern.search(str_type):
filtered_types.append(ds_name)

return filtered_types

return filtered_names

def list(
self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0
) -> List[str]: # noqa: UP006
# TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917
# TODO: make regex_search mandatory argument as we have catalog.keys() for listing all the datasets.
# TODO: remove when removing old catalog
"""List all dataset names registered in the catalog, optionally filtered by a regex pattern.

If a regex pattern is provided, only dataset names matching the pattern will be returned.
Expand Down Expand Up @@ -590,12 +663,7 @@ def list(
if not regex_flags:
regex_flags = re.IGNORECASE

try:
pattern = re.compile(regex_search, flags=regex_flags)
except re.error as exc:
raise SyntaxError(
f"Invalid regular expression provided: '{regex_search}'"
) from exc
pattern = _compile_pattern(regex_search, regex_flags)
return [ds_name for ds_name in self.__iter__() if pattern.search(ds_name)]

def save(self, name: str, data: Any) -> None:
Expand Down Expand Up @@ -745,3 +813,12 @@ def exists(self, name: str) -> bool:
except DatasetNotFoundError:
return False
return dataset.exists()


def _compile_pattern(regex: str, regex_flags: int | re.RegexFlag) -> re.Pattern:
ElenaKhaustova marked this conversation as resolved.
Show resolved Hide resolved
try:
pattern = re.compile(regex, flags=regex_flags)
except re.error as exc:
raise SyntaxError(f"Invalid regular expression provided: '{regex}'") from exc

return pattern
73 changes: 66 additions & 7 deletions tests/io/test_kedro_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ def conflicting_feed_dict():

@pytest.fixture
def multi_catalog():
    """Catalog with two CSV datasets and one Parquet dataset for filter/list tests."""
    return KedroDataCatalog(
        {
            "abc": CSVDataset(filepath="abc.csv"),
            "def": CSVDataset(filepath="def.csv"),
            "xyz": ParquetDataset(filepath="xyz.parq"),
        }
    )


@pytest.fixture
Expand Down Expand Up @@ -159,8 +160,9 @@ def test_multi_catalog_list(self, multi_catalog):
[
("^a", ["abc"]),
("a|x", ["abc", "xyz"]),
("^(?!(a|x))", []),
("def", []),
("^(?!(a|d|x))", []),
("def", ["def"]),
("ghi", []),
("", []),
],
)
Expand All @@ -175,6 +177,61 @@ def test_multi_catalog_list_bad_regex(self, multi_catalog):
with pytest.raises(SyntaxError, match=pattern):
multi_catalog.list("((")

@pytest.mark.parametrize(
"name_regex,type_regex,expected",
[
("^a", None, ["abc"]),
("a|x", None, ["abc", "xyz"]),
("a|d|x", None, ["abc", "def", "xyz"]),
("a|d|x", "CSVDataset", ["abc", "def"]),
("a|d|x", "kedro_datasets", ["abc", "def", "xyz"]),
(None, "ParquetDataset", ["xyz"]),
("^(?!(a|d|x))", None, []),
("def", None, ["def"]),
(None, None, ["abc", "def", "xyz"]),
("a|d|x", "no_such_dataset", []),
],
)
def test_catalog_filter_regex(
self, multi_catalog, name_regex, type_regex, expected
):
"""Test that regex patterns filter materialized datasets accordingly"""
assert (
multi_catalog.filter(name_regex=name_regex, type_regex=type_regex)
== expected
)

@pytest.mark.parametrize(
"name_regex,type_regex,expected",
[
("b|m", None, ["boats", "materialized"]),
(None, None, ["boats", "cars", "materialized"]),
(None, "CSVDataset", ["boats", "cars"]),
(None, "ParquetDataset", ["materialized"]),
("b|c", "ParquetDataset", []),
],
)
def test_from_config_catalog_filter_regex(
self, data_catalog_from_config, name_regex, type_regex, expected
):
"""Test that regex patterns filter lazy and materialized datasets accordingly"""
data_catalog_from_config["materialized"] = ParquetDataset(filepath="xyz.parq")
assert (
data_catalog_from_config.filter(
name_regex=name_regex, type_regex=type_regex
)
== expected
)

def test_catalog_filter_bad_regex(self, multi_catalog):
"""Test that bad regex is caught accordingly"""
escaped_regex = r"\(\("
pattern = f"Invalid regular expression provided: '{escaped_regex}'"
with pytest.raises(SyntaxError, match=pattern):
multi_catalog.filter(name_regex="((")
with pytest.raises(SyntaxError, match=pattern):
multi_catalog.filter(type_regex="((")
ElenaKhaustova marked this conversation as resolved.
Show resolved Hide resolved

def test_eq(self, multi_catalog, data_catalog):
assert multi_catalog == multi_catalog.shallow_copy()
assert multi_catalog != data_catalog
Expand Down Expand Up @@ -266,12 +323,14 @@ def test_init_with_raw_data(self, dummy_dataframe, dataset):
assert isinstance(catalog["ds"], CSVDataset)
assert isinstance(catalog["df"], MemoryDataset)

def test_repr(self, data_catalog):
assert data_catalog.__repr__() == str(data_catalog)
    def test_repr(self, data_catalog_from_config):
        # __repr__ and str() must agree for a catalog built from config
        # (i.e. one containing lazy datasets).
        assert data_catalog_from_config.__repr__() == str(data_catalog_from_config)

def test_repr_no_type_found(self, data_catalog_from_config):
del data_catalog_from_config._lazy_datasets["boats"].config["type"]
assert data_catalog_from_config.__repr__() == str(data_catalog_from_config)
pattern = "'type' is missing from dataset catalog configuration"
with pytest.raises(DatasetError, match=re.escape(pattern)):
_ = str(data_catalog_from_config)

def test_missing_keys_from_load_versions(self, correct_config):
"""Test load versions include keys missing in the catalog"""
Expand Down