Skip to content

Commit

Permalink
Merge pull request #159 from LSSTDESC/u/stuart/refactor_autogenerated…
Browse files Browse the repository at this point in the history
…_name

Refactoring the auto generated relative paths
  • Loading branch information
stuartmcalpine authored Oct 24, 2024
2 parents 9d2bb02 + 43ac2fd commit 81949eb
Show file tree
Hide file tree
Showing 8 changed files with 120 additions and 13 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## Version 1.0.3

Some changes to the way the `relative_path` is automatically generated from the
`name` and `version`.

- All automatically generated `relative_paths` are placed in a top level
`.gen_paths` directory, e.g.,
`<root_dir>/<schema>/<owner_type>/<owner>/.gen_paths`. This is to prevent
clashes with user specified `relative_paths`.
- Single files no longer have their filename changed when automatically
generating the `relative_path`. Instead a directory containing the `name` and
`version` is created, and the file is copied there. This preserves the
filname suffix in the relative path.

## Version 1.0.2

Expand Down
25 changes: 22 additions & 3 deletions docs/source/tutorial_notebooks/datasets_deeper_look.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,9 @@
"\n",
"The files and directories of registered datasets are stored under a path relative to the root directory (`root_dir`), which, by default, is a shared space at NERSC.\n",
"\n",
"By default, the `relative_path` is constructed from the `name` and `version`, in the format `relative_path=<name>/<version>. However, one can also manually select the relative_path during registration, for example"
"By default, when not manually specified, the `relative_path` is constructed from the `name` and `version`, in the format `relative_path=.gen_paths/<name>_<version>/`. \n",
"\n",
"One can manually select the `relative_path` during registration if they explicitly care about where the data is located relative to the `root_dir`, for example"
]
},
{
Expand Down Expand Up @@ -208,9 +210,26 @@
"source": [
"will register a dataset under the `relative_path` of `nersc_tutorial/my_desc_dataset`.\n",
"\n",
"For those interested, the eventual full path for the dataset will be `<root_dir>/<schema>/<owner_type>/<owner>/<relative_path>`. Naturally, the `relative_path` you select cannot already be taken by another dataset (an error will be raised in this case).\n",
"If the registered dataset was a single file, the `relative_path` will be the explicit (relative) pathname to that file, e.g., `.gen_paths/mydataset_1.0.0/myfile.txt` or `my/manual/path/myfile.txt`. If the registered dataset was a directory, the `relative_path` is the pathname to the directory containing the dataset contents.\n",
"\n",
"For those interested, the eventual full path for the dataset will be `<root_dir>/<schema>/<owner_type>/<owner>/<relative_path>`. Naturally, the `relative_path` you select cannot already be taken by another dataset (an error will be raised in this case), and any manually specified `relative_path` cannot start with `.gen_paths` as this directory is reserved for autogenerated `relative_path`s.\n",
"\n",
"When you overwrite a previous dataset entry using the `replace()` function, the original `relative_path` at registration (automatically generated or manual) will be used.\n",
"\n",
"One can construct the full absolute path to a dataregistry file using the `get_dataset_absolute_path()` helper function, e.g.,"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f193290d-892a-47b6-8e91-8e94a10d506f",
"metadata": {},
"outputs": [],
"source": [
"# Find the full absolute path to a dataset using the dataset id\n",
"absolute_path = datareg.Query.get_dataset_absolute_path(dataset_id)\n",
"\n",
"When you overwrite a previous dataset entry using the `replace()` function, the original `relative_path` at registration (automatically generated or manual) will be used."
"print(f\"The absolute path for {dataset_id} is {absolute_path}\""
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion src/dataregistry/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.2"
__version__ = "1.0.3"
27 changes: 25 additions & 2 deletions src/dataregistry/registrar/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from .dataset_util import set_dataset_status, get_dataset_status

_ILLEGAL_NAME_CHAR = ["$", "*", "&", "/", "?", "\\", " "]

_ILLEGAL_RELPATH_CHAR = ["$", "*", "&", "?", "\\", " "]

class DatasetTable(BaseTable):
def __init__(self, db_connection, root_dir, owner, owner_type, execution_table):
Expand Down Expand Up @@ -83,6 +83,29 @@ def _validate_register_inputs(
"External datasets require either a url or contact_email"
)

# Make sure the user passed `relative_path` is legal
# Only needed for `register` function, `replace` has no `relative_path`
# argument as this cannot be changed from the original `register`
if "relative_path" in kwargs_dict.keys():

if kwargs_dict["relative_path"] is not None:
if type(kwargs_dict["relative_path"]) != str:
raise ValueError("Relative path is not a valid string")

# Relative path cannot start with a "/"
if kwargs_dict["relative_path"][0] == "/":
raise ValueError("Relative path cannot start with a '/'")

# Make sure `relative_path` is legal (i.e., no illegal characters)
for i_char in _ILLEGAL_RELPATH_CHAR:
if i_char in kwargs_dict["relative_path"]:
raise ValueError(f"Cannot have character {i_char} in relative path")

# The '.gen_paths' directory is reserved for autogenerated paths
if kwargs_dict["relative_path"].startswith(".gen_paths"):
raise ValueError("Can't start relative path with '.gen_paths', "
"this is reserved for auto-generated `relative_paths")

# Assign the `owner_type`
if kwargs_dict["owner_type"] is None:
if self._owner_type is not None:
Expand Down Expand Up @@ -409,7 +432,7 @@ def register(
# If `relative_path` not passed, automatically generate it
if kwargs_dict["relative_path"] is None:
kwargs_dict["relative_path"] = _relpath_from_name(
name, kwargs_dict["version_string"]
name, kwargs_dict["version_string"], kwargs_dict["old_location"]
)

# Make sure the relative_path in the `root_dir` is avaliable
Expand Down
22 changes: 20 additions & 2 deletions src/dataregistry/registrar/registrar_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,22 +327,40 @@ def _compute_checksum(file_path):

raise Exception(e)

def _relpath_from_name(name, version):
def _relpath_from_name(name, version, old_location):
"""
Construct a relative path from the name and version of a dataset.
We use this when the `relative_path` is not explicitly defined.
Every automatically generated `relative_path` is prefixed with
`.gen_paths/`, meaning that all automatically generated `relative_paths` go
into this top level folder. This is to prevent clashes with user specified
`relative_path`'s.
The auto-generated `relative_path` will be a directory that contains the
name and version, which is where the ingested data (from `old_location`)
will eventually reside. If the data being ingested is a single file, the
`relative_path` will be the full path to the file within the registry, not
just the directory that contains the file.
Parameters
----------
name : str
Dataset name
version : str
Dataset version
old_location : str
Path the data is coming from (needed to parse filename)
Returns
-------
relative_path : str
Automatically generated `relative_path`
"""

return f"{name}_{version}"
# For single files, scrape the filename and add it to the `relative_path`
if (old_location is not None) and os.path.isfile(old_location):
return os.path.join(".gen_paths", f"{name}_{version}", os.path.basename(old_location))
else:
# For directories, only need the autogenerated directory name
return os.path.join(".gen_paths", f"{name}_{version}")
4 changes: 2 additions & 2 deletions tests/end_to_end_tests/test_register_dataset_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_register_dataset_defaults(dummy_file):
assert results["owner"][0] == os.getenv("USER")
assert results["owner_type"][0] == "user"
assert results["description"][0] == None
assert results["relative_path"][0] == f"{_NAME}_0.0.1"
assert results["relative_path"][0] == f".gen_paths/{_NAME}_0.0.1"
assert results["data_org"][0] == "dummy"
assert results["execution_id"][0] >= 0
assert results["dataset_id"][0] >= 0
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_dataset_bumping(dummy_file, v_type, ans):
# Check the result
assert results["dataset.name"][0] == _NAME
assert results["dataset.version_string"][0] == ans
assert results["dataset.relative_path"][0] == f"{_NAME}_{ans}"
assert results["dataset.relative_path"][0] == f".gen_paths/{_NAME}_{ans}"


@pytest.mark.parametrize("owner_type", ["user", "group", "project"])
Expand Down
34 changes: 34 additions & 0 deletions tests/end_to_end_tests/test_register_dataset_real_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,40 @@ def test_registering_bad_relative_path(dummy_file, link):
relative_path=f"test/register/bad/relpath/{link}",
)

@pytest.mark.parametrize("link", ["file1.txt", "directory1"])
def test_registering_bad_relative_path_2(dummy_file, link):
"""
Make sure we cannot register a datataset to a relative path that is using
the auto generated .gen_paths directry.
"""

# Establish connection to database
tmp_src_dir, tmp_root_dir = dummy_file
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING)

data_path = str(tmp_src_dir / link)

# Test going into the '.gen_paths' folder (not allowed)
with pytest.raises(ValueError, match="Can't start relative path with '.gen_paths'"):
d_id = _insert_dataset_entry(
datareg,
f"DESC:datasets:test_registering_bad_relative_path_3_{link}",
"0.0.1",
old_location=data_path,
location_type="dataregistry",
relative_path=f".gen_paths/test/register/bad/relpath/{link}",
)

# Test registering a file or directory as '.gen_paths' (not allowed)
with pytest.raises(ValueError, match="Can't start relative path with '.gen_paths'"):
d_id = _insert_dataset_entry(
datareg,
f"DESC:datasets:test_registering_bad_relative_path_4_{link}",
"0.0.1",
old_location=data_path,
location_type="dataregistry",
relative_path=f".gen_paths",
)

@pytest.mark.parametrize("link", ["file1.txt", "directory1"])
def test_registering_deleted_relative_path(dummy_file, link):
Expand Down
6 changes: 3 additions & 3 deletions tests/unit_tests/test_registrar_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,17 @@ def test_read_file(tmpdir, nchars, max_config_length, ans):
@pytest.mark.parametrize(
"name,version_string,ans",
[
("mydataset", "1.1.1", "mydataset_1.1.1"),
("mydataset", "1.1.1", ".gen_paths/mydataset_1.1.1"),
],
)
def test_relpath_from_name(name, version_string, ans):
def test_relpath_from_name(name, version_string, ans, old_location=None):
"""
Test dataset path construction
Datasets should come back with the format:
<root_dir>/<owner_type>/<owner>/<relative_path>
"""

tmp = _relpath_from_name(name, version_string)
tmp = _relpath_from_name(name, version_string, old_location)
assert tmp == ans

@pytest.mark.parametrize(
Expand Down

0 comments on commit 81949eb

Please sign in to comment.