Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring the auto generated relative paths #159

Merged
merged 7 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## Version 1.0.3

Some changes to the way the `relative_path` is automatically generated from the
`name` and `version`.

- All automatically generated `relative_paths` are placed in a top level
`.gen_paths` directory, e.g.,
`<root_dir>/<schema>/<owner_type>/<owner>/.gen_paths`. This is to prevent
clashes with user specified `relative_paths`.
- Single files no longer have their filename changed when automatically
generating the `relative_path`. Instead a directory containing the `name` and
`version` is created, and the file is copied there. This preserves the
filname suffix in the relative path.

## Version 1.0.2

Expand Down
25 changes: 22 additions & 3 deletions docs/source/tutorial_notebooks/datasets_deeper_look.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,9 @@
"\n",
"The files and directories of registered datasets are stored under a path relative to the root directory (`root_dir`), which, by default, is a shared space at NERSC.\n",
"\n",
"By default, the `relative_path` is constructed from the `name` and `version`, in the format `relative_path=<name>/<version>. However, one can also manually select the relative_path during registration, for example"
"By default, when not manually specified, the `relative_path` is constructed from the `name` and `version`, in the format `relative_path=.gen_paths/<name>_<version>/`. \n",
"\n",
"One can manually select the `relative_path` during registration if they explicitly care about where the data is located relative to the `root_dir`, for example"
]
},
{
Expand Down Expand Up @@ -208,9 +210,26 @@
"source": [
"will register a dataset under the `relative_path` of `nersc_tutorial/my_desc_dataset`.\n",
"\n",
"For those interested, the eventual full path for the dataset will be `<root_dir>/<schema>/<owner_type>/<owner>/<relative_path>`. Naturally, the `relative_path` you select cannot already be taken by another dataset (an error will be raised in this case).\n",
"If the registered dataset was a single file, the `relative_path` will be the explicit (relative) pathname to that file, e.g., `.gen_paths/mydataset_1.0.0/myfile.txt` or `my/manual/path/myfile.txt`. If the registered dataset was a directory, the `relative_path` is the pathname to the directory containing the dataset contents.\n",
"\n",
"For those interested, the eventual full path for the dataset will be `<root_dir>/<schema>/<owner_type>/<owner>/<relative_path>`. Naturally, the `relative_path` you select cannot already be taken by another dataset (an error will be raised in this case), and any manually specified `relative_path` cannot start with `.gen_paths` as this directory is reserved for autogenerated `relative_path`s.\n",
"\n",
"When you overwrite a previous dataset entry using the `replace()` function, the original `relative_path` at registration (automatically generated or manual) will be used.\n",
"\n",
"One can construct the full absolute path to a dataregistry file using the `get_dataset_absolute_path()` helper function, e.g.,"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f193290d-892a-47b6-8e91-8e94a10d506f",
"metadata": {},
"outputs": [],
"source": [
"# Find the full absolute path to a dataset using the dataset id\n",
"absolute_path = datareg.Query.get_dataset_absolute_path(dataset_id)\n",
"\n",
"When you overwrite a previous dataset entry using the `replace()` function, the original `relative_path` at registration (automatically generated or manual) will be used."
"print(f\"The absolute path for {dataset_id} is {absolute_path}\""
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion src/dataregistry/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.2"
__version__ = "1.0.3"
27 changes: 25 additions & 2 deletions src/dataregistry/registrar/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from .dataset_util import set_dataset_status, get_dataset_status

_ILLEGAL_NAME_CHAR = ["$", "*", "&", "/", "?", "\\", " "]

_ILLEGAL_RELPATH_CHAR = ["$", "*", "&", "?", "\\", " "]

class DatasetTable(BaseTable):
def __init__(self, db_connection, root_dir, owner, owner_type, execution_table):
Expand Down Expand Up @@ -83,6 +83,29 @@ def _validate_register_inputs(
"External datasets require either a url or contact_email"
)

# Make sure the user passed `relative_path` is legal
# Only needed for `register` function, `replace` has no `relative_path`
# argument as this cannot be changed from the original `register`
if "relative_path" in kwargs_dict.keys():

if kwargs_dict["relative_path"] is not None:
if type(kwargs_dict["relative_path"]) != str:
raise ValueError("Relative path is not a valid string")

# Relative path cannot start with a "/"
if kwargs_dict["relative_path"][0] == "/":
raise ValueError("Relative path cannot start with a '/'")

# Make sure `relative_path` is legal (i.e., no illegal characters)
for i_char in _ILLEGAL_RELPATH_CHAR:
if i_char in kwargs_dict["relative_path"]:
raise ValueError(f"Cannot have character {i_char} in relative path")

# The '.gen_paths' directory is reserved for autogenerated paths
if kwargs_dict["relative_path"].startswith(".gen_paths"):
raise ValueError("Can't start relative path with '.gen_paths', "
"this is reserved for auto-generated `relative_paths")

# Assign the `owner_type`
if kwargs_dict["owner_type"] is None:
if self._owner_type is not None:
Expand Down Expand Up @@ -409,7 +432,7 @@ def register(
# If `relative_path` not passed, automatically generate it
if kwargs_dict["relative_path"] is None:
kwargs_dict["relative_path"] = _relpath_from_name(
name, kwargs_dict["version_string"]
name, kwargs_dict["version_string"], kwargs_dict["old_location"]
)

# Make sure the relative_path in the `root_dir` is avaliable
Expand Down
22 changes: 20 additions & 2 deletions src/dataregistry/registrar/registrar_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,22 +327,40 @@ def _compute_checksum(file_path):

raise Exception(e)

def _relpath_from_name(name, version):
def _relpath_from_name(name, version, old_location):
"""
Construct a relative path from the name and version of a dataset.
We use this when the `relative_path` is not explicitly defined.

Every automatically generated `relative_path` is prefixed with
`.gen_paths/`, meaning that all automatically generated `relative_paths` go
into this top level folder. This is to prevent clashes with user specified
`relative_path`'s.

The auto-generated `relative_path` will be a directory that contains the
name and version, which is where the ingested data (from `old_location`)
will eventually reside. If the data being ingested is a single file, the
`relative_path` will be the full path to the file within the registry, not
just the directory that contains the file.

Parameters
----------
name : str
Dataset name
version : str
Dataset version
old_location : str
Path the data is coming from (needed to parse filename)

Returns
-------
relative_path : str
Automatically generated `relative_path`
"""

return f"{name}_{version}"
# For single files, scrape the filename and add it to the `relative_path`
if (old_location is not None) and os.path.isfile(old_location):
return os.path.join(".gen_paths", f"{name}_{version}", os.path.basename(old_location))
else:
# For directories, only need the autogenerated directory name
return os.path.join(".gen_paths", f"{name}_{version}")
4 changes: 2 additions & 2 deletions tests/end_to_end_tests/test_register_dataset_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_register_dataset_defaults(dummy_file):
assert results["owner"][0] == os.getenv("USER")
assert results["owner_type"][0] == "user"
assert results["description"][0] == None
assert results["relative_path"][0] == f"{_NAME}_0.0.1"
assert results["relative_path"][0] == f".gen_paths/{_NAME}_0.0.1"
assert results["data_org"][0] == "dummy"
assert results["execution_id"][0] >= 0
assert results["dataset_id"][0] >= 0
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_dataset_bumping(dummy_file, v_type, ans):
# Check the result
assert results["dataset.name"][0] == _NAME
assert results["dataset.version_string"][0] == ans
assert results["dataset.relative_path"][0] == f"{_NAME}_{ans}"
assert results["dataset.relative_path"][0] == f".gen_paths/{_NAME}_{ans}"


@pytest.mark.parametrize("owner_type", ["user", "group", "project"])
Expand Down
34 changes: 34 additions & 0 deletions tests/end_to_end_tests/test_register_dataset_real_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,40 @@ def test_registering_bad_relative_path(dummy_file, link):
relative_path=f"test/register/bad/relpath/{link}",
)

@pytest.mark.parametrize("link", ["file1.txt", "directory1"])
def test_registering_bad_relative_path_2(dummy_file, link):
"""
Make sure we cannot register a datataset to a relative path that is using
the auto generated .gen_paths directry.
"""

# Establish connection to database
tmp_src_dir, tmp_root_dir = dummy_file
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING)

data_path = str(tmp_src_dir / link)

# Test going into the '.gen_paths' folder (not allowed)
with pytest.raises(ValueError, match="Can't start relative path with '.gen_paths'"):
d_id = _insert_dataset_entry(
datareg,
f"DESC:datasets:test_registering_bad_relative_path_3_{link}",
"0.0.1",
old_location=data_path,
location_type="dataregistry",
relative_path=f".gen_paths/test/register/bad/relpath/{link}",
)

# Test registering a file or directory as '.gen_paths' (not allowed)
with pytest.raises(ValueError, match="Can't start relative path with '.gen_paths'"):
d_id = _insert_dataset_entry(
datareg,
f"DESC:datasets:test_registering_bad_relative_path_4_{link}",
"0.0.1",
old_location=data_path,
location_type="dataregistry",
relative_path=f".gen_paths",
)

@pytest.mark.parametrize("link", ["file1.txt", "directory1"])
def test_registering_deleted_relative_path(dummy_file, link):
Expand Down
6 changes: 3 additions & 3 deletions tests/unit_tests/test_registrar_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,17 @@ def test_read_file(tmpdir, nchars, max_config_length, ans):
@pytest.mark.parametrize(
"name,version_string,ans",
[
("mydataset", "1.1.1", "mydataset_1.1.1"),
("mydataset", "1.1.1", ".gen_paths/mydataset_1.1.1"),
],
)
def test_relpath_from_name(name, version_string, ans):
def test_relpath_from_name(name, version_string, ans, old_location=None):
"""
Test dataset path construction
Datasets should come back with the format:
<root_dir>/<owner_type>/<owner>/<relative_path>
"""

tmp = _relpath_from_name(name, version_string)
tmp = _relpath_from_name(name, version_string, old_location)
assert tmp == ans

@pytest.mark.parametrize(
Expand Down
Loading