Fix overwrite directories #86

Merged

merged 6 commits into from Dec 6, 2023

Changes from 2 commits
10 changes: 7 additions & 3 deletions docs/source/tutorial_notebooks/getting_started.ipynb
@@ -169,7 +169,9 @@
"\n",
"### The relative path\n",
"\n",
"Datasets are registered at the data registry shared space under a relative path. For those interested, the eventual full path for the dataset will be `<root_dir>/<owner_type>/<owner>/<relative_path>`. The relative path is one of the two required parameters you must specify when registering a dataset (in the example here our relative path is `nersc_tutorial/my_desc_dataset`).\n",
"Datasets are registered at the data registry shared space under a path relative to the root directory. For those interested, the eventual full path for the dataset will be `<root_dir>/<owner_type>/<owner>/<relative_path>`. This means that the combination of `relative_path`, `owner` and `owner_type` must be unique within the registry, and therefore cannot already be taken when you register a new dataset (unless you allow your datasets to be overwritable; see below).\n",
"\n",
"The relative path is one of the two required parameters you must specify when registering a dataset (in the example here our relative path is `nersc_tutorial/my_desc_dataset`).\n",
"\n",
"### The version string\n",
"\n",
@@ -186,7 +188,9 @@
"\n",
"### Overwriting datasets\n",
"\n",
"By default datasets are not overwritable. In those scenarios you will need to choose a combination of `relative_path`, `owner` and `owner_type` that is not already taken in the database. For our example we have set it so that the dataset can be overwritten so that it does not raise an error through multiple tests. Note that when a dataset has `is_overwritable=true`, the data in the shared space will be overwritten with each registration, but the entry in the data registry database is never lost (a new unique entry is created each time, and the 'old' entries will obtain `true` for their `is_overwritten` row).\n",
"By default, datasets in the data registry, once registered, are not overwritable. You can change this behavior by setting `is_overwritable=true` when registering your datasets. If `is_overwritable=true` on one of your previous datasets, you can register a new dataset with the same combination of `relative_path`, `owner` and `owner_type` as before (be warned that any previous data stored under this path will be deleted first). \n",
"\n",
"Note that whilst the data in the shared space will be overwritten with each registration when `is_overwritable=true`, the original entries in the data registry database are never lost (a new unique entry is created each time, and the 'old' entries will have `true` in their `is_overwritten` column).\n",
"\n",
"### Copying the data\n",
"\n",
@@ -362,7 +366,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"version": "3.10.12"
}
},
"nbformat": 4,
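The full-path rule described in the tutorial cell above can be sketched as follows. This is illustrative only — the function name and the example values (`/root_dir`, `user`, `a_user`) are placeholders, not the registry's actual helper or configuration:

```python
import os


def form_full_path(root_dir, owner_type, owner, relative_path):
    """Illustrative sketch: build the eventual full path of a registered
    dataset, following <root_dir>/<owner_type>/<owner>/<relative_path>."""
    return os.path.join(root_dir, owner_type, owner, relative_path)


# Made-up values mirroring the tutorial's relative path
print(form_full_path("/root_dir", "user", "a_user", "nersc_tutorial/my_desc_dataset"))
# → /root_dir/user/a_user/nersc_tutorial/my_desc_dataset
```

Because the full path is this fixed combination, two datasets with the same `relative_path`, `owner` and `owner_type` would collide on disk — which is why that combination must be unique unless the dataset is overwritable.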
10 changes: 2 additions & 8 deletions src/dataregistry/registrar.py
@@ -1,7 +1,6 @@
import time
import os
from datetime import datetime
from shutil import copyfile, copytree

# from sqlalchemy import MetaData, Table, Column, insert, text,
from sqlalchemy import update, select
@@ -10,7 +9,7 @@
from dataregistry.db_basic import add_table_row
from dataregistry.registrar_util import _form_dataset_path, get_directory_info
from dataregistry.registrar_util import _parse_version_string, _bump_version
from dataregistry.registrar_util import _name_from_relpath
from dataregistry.registrar_util import _name_from_relpath, _copy_data
from dataregistry.db_basic import TableMetadata

# from dataregistry.exceptions import *
@@ -204,12 +203,7 @@ def _handle_data(self, relative_path, old_location, owner, owner_type, verbose):
f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...",
end="",
)
if dataset_organization == "file":
# Create any intervening directories
os.makedirs(os.path.dirname(dest), exist_ok=True)
copyfile(old_location, dest)
elif dataset_organization == "directory":
copytree(old_location, dest, copy_function=copyfile)
_copy_data(dataset_organization, old_location, dest)
if verbose:
print(f"took {time.time()-tic:.2f}")

42 changes: 42 additions & 0 deletions src/dataregistry/registrar_util.py
@@ -1,13 +1,15 @@
import os
import re
from sqlalchemy import MetaData, Table, Column, text, select
from shutil import copyfile, copytree, rmtree

__all__ = [
"_parse_version_string",
"_bump_version",
"_form_dataset_path",
"get_directory_info",
"_name_from_relpath",
"_copy_data",
]
VERSION_SEPARATOR = "."
_nonneg_int_re = "0|[1-9][0-9]*"
@@ -207,3 +209,43 @@ def _name_from_relpath(relative_path):
name = base

return name

def _copy_data(dataset_organization, source, dest):
"""
Copy data from one location to another (for ingesting directories and files
into the `root_dir` shared space).

Note that prior to this, in `_handle_data`, it has already been checked that
`source` exists, so we do not have to check again.

Parameters
----------
dataset_organization : str
The dataset organization, either "file" or "directory"
source : str
Path of source file or directory
dest : str
Destination we are copying to
"""

# Copy a single file
if dataset_organization == "file":
# Create any intervening directories
os.makedirs(os.path.dirname(dest), exist_ok=True)

# Copy the file
copyfile(source, dest)

# Copy a single directory (and subdirectories)
elif dataset_organization == "directory":
# If the directory already exists, need to delete it first
if os.path.isdir(dest):
rmtree(dest)
Collaborator:

If anything goes wrong with the subsequent copy, the old dataset will be gone and it won't be possible to recover the old state. It would be safer to mv the old dataset somewhere (e.g. to something with a path similar to the true destination, but perhaps starting with a . or with strange characters - emacs uses # at the beginning and end of the filename - strategically placed), copy in the new data, then if that succeeds do rmtree.

If we're feeling really paranoid, the rmtree shouldn't happen until after the db row is successfully created. And for datasets which are just files we should also do just a mv initially to a similar name, then rm after the database entry is made.

Collaborator Author:

I've changed it as suggested.

When the destination exists, mv it to a backup, copy the data, then delete the backup if successful. If the try fails, mv the backup back to the original.

If it's an individual file being ingested, a checksum is also performed.

I think wrapping the entire database entry in a try would be a bit cumbersome, just to check that the entry is also successful before deleting the backup.
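The backup-then-copy sequence agreed on in this thread can be sketched roughly as below. `copy_with_backup` and its backup-naming scheme are illustrative, not the code actually merged in this PR:

```python
import os
from shutil import copyfile, copytree, rmtree


def copy_with_backup(dataset_organization, source, dest):
    """Illustrative sketch: set aside any existing `dest` as a hidden
    backup, copy the new data, and only delete the backup once the
    copy has succeeded."""
    backup = None
    if os.path.exists(dest):
        # Park the old data under a dotted name next to the true destination
        backup = os.path.join(
            os.path.dirname(dest), "." + os.path.basename(dest) + ".backup"
        )
        os.rename(dest, backup)
    try:
        if dataset_organization == "file":
            # Create any intervening directories, then copy the file
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            copyfile(source, dest)
        elif dataset_organization == "directory":
            copytree(source, dest, copy_function=copyfile)
        else:
            raise ValueError(f"Unknown dataset_organization: {dataset_organization}")
    except Exception:
        # The copy failed: restore the old state from the backup
        if backup is not None:
            if os.path.isdir(dest):
                rmtree(dest)
            elif os.path.isfile(dest):
                os.remove(dest)
            os.rename(backup, dest)
        raise
    # The copy succeeded: the backup is no longer needed
    if backup is not None:
        if os.path.isdir(backup):
            rmtree(backup)
        else:
            os.remove(backup)
```

The key property is that the old dataset is only ever renamed (cheap and atomic on the same filesystem), never deleted, until the new data is fully in place.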


# Copy the directory
copytree(source, dest, copy_function=copyfile)

else:
raise ValueError(f"{dataset_organization} is not a valid dataset_organization")


60 changes: 60 additions & 0 deletions tests/unit_tests/test_rutil_copy_data.py
@@ -0,0 +1,60 @@
import pytest
import os
from dataregistry.registrar_util import _copy_data


@pytest.fixture
def dummy_file(tmp_path):
"""Create some dummy (temporary) files and directories"""

# Root temp source dir for files
tmp_src_dir = tmp_path / "source"
tmp_src_dir.mkdir()

# Make a dummy file
p = tmp_src_dir / "dummy_standalone_file.txt"
p.write_text("dummy stand alone file")

# Make a dummy directory with a file in it
p = tmp_src_dir / "tmpdir" / "tmpdir2"
p.mkdir(parents=True)
q = p / "dummy_file_within_folder.txt"
q.write_text("dummy file within folder")

# Root temp dest dir to copy into
tmp_dest_dir = tmp_path / "dest"
tmp_dest_dir.mkdir()

return tmp_src_dir, tmp_dest_dir


def test_copy_data(dummy_file):
"""
Test copying files and directories

Each test is looped twice, the 2nd emulating overwriting a dataset.
"""

tmp_src_dir, tmp_dest_dir = dummy_file

# Copy a single file from source to destination
for i in range(2):
_copy_data(
"file",
tmp_src_dir / "dummy_standalone_file.txt",
tmp_dest_dir / "dummy_standalone_file.txt",
)

p = tmp_dest_dir / "dummy_standalone_file.txt"
assert os.path.isfile(p)
assert p.read_text() == "dummy stand alone file"

# Copy a single directory from source to destination
for i in range(2):
_copy_data("directory", tmp_src_dir / "tmpdir", tmp_dest_dir / "tmpdir")

assert os.path.isdir(tmp_dest_dir / "tmpdir")
assert os.path.isdir(tmp_dest_dir / "tmpdir" / "tmpdir2")
p = tmp_dest_dir / "tmpdir" / "tmpdir2" / "dummy_file_within_folder.txt"
assert os.path.isfile(p)
assert p.read_text() == "dummy file within folder"