From c6a04d2e278d91c16caa02bd76969c5e88235830 Mon Sep 17 00:00:00 2001 From: Stuart Mcalpine Date: Sun, 15 Sep 2024 11:04:00 -0700 Subject: [PATCH] Update tutorial notebooks --- .../datasets_deeper_look.ipynb | 60 ++++++++---- .../production_schema.ipynb | 60 +++++------- .../tutorial_notebooks/query_datasets.ipynb | 70 ++++++++++---- .../register_datasets.ipynb | 93 ++++++++++++++----- 4 files changed, 184 insertions(+), 99 deletions(-) diff --git a/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb b/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb index e9344c4f..06537796 100644 --- a/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb +++ b/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb @@ -39,8 +39,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c2f92bf-9048-421e-b896-292eb00542c8", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if ran multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." 
] }, { @@ -55,13 +67,15 @@ "cell_type": "code", "execution_count": null, "id": "72eabcd0-b05e-4e87-9ed1-6450ac196b05", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from dataregistry import DataRegistry\n", "\n", - "# Establish connection to database (using defaults)\n", - "datareg = DataRegistry()" + "# Establish connection to the tutorial schema\n", + "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)" ] }, { @@ -78,7 +92,9 @@ "cell_type": "code", "execution_count": null, "id": "560b857c-7d94-44ad-9637-0b107cd42259", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "print(datareg.Registrar.dataset.get_keywords())" @@ -98,7 +114,9 @@ "cell_type": "code", "execution_count": null, "id": "44581049-1d15-44f0-b1ed-34cff6cdb45a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new dataset entry with keywords.\n", @@ -132,7 +150,9 @@ "cell_type": "code", "execution_count": null, "id": "09478b87-7d5a-4814-85c7-49f90e0db45d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# List of keywords to add to dataset\n", @@ -160,22 +180,24 @@ "\n", "The files and directories of registered datasets are stored under a path relative to the root directory (`root_dir`), which, by default, is a shared space at NERSC.\n", "\n", - "By default, the relative_path is constructed from the `name`, `version` and `version_suffix` (if there is one), in the format `relative_path=/_`. However, one can also manually select the relative_path during registration, for example" + "By default, the `relative_path` is constructed from the `name`, `version` and `version_suffix` (if there is one), in the format `relative_path=/_`. 
However, one can also manually select the relative_path during registration, for example" ] }, { "cell_type": "code", "execution_count": null, "id": "5bc0d5b6-f50a-4646-bc1b-7d9e829e91bc", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new entry with a manual relative path.\n", "datareg.Registrar.dataset.register(\n", " \"nersc_tutorial:my_desc_dataset_with_relative_path\",\n", " \"1.0.0\",\n", - " relative_path=\"nersc_tutorial/my_desc_dataset\",\n", - " location_type=\"dummy\", # for testing, means we need no data\n", + " relative_path=f\"NERSC_tutorial/{OWNER}/my_desc_dataset\",\n", + " location_type=\"dummy\", # for testing, means we need no actual data to exist\n", ")" ] }, @@ -216,19 +238,21 @@ "cell_type": "code", "execution_count": null, "id": "718d1cd8-4517-4597-9e36-e403e219cef2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "from dataregistry.dataset_util import get_dataset_status\n", + "from dataregistry.registrar.dataset_util import get_dataset_status\n", "\n", "# The `get_dataset_status` function takes in a dataset `status` and a bit index, and returns if that bit is True or False\n", "dataset_status = 1\n", "\n", "# Is dataset valid?\n", - "print(f\"Dataset is valid: {get_dataset_status(dataset_status, \"valid\"}\")\n", + "print(f\"Dataset is valid: {get_dataset_status(dataset_status, 'valid')}\")\n", "\n", "# Is dataset replaced?\n", - "print(f\"Dataset is replaced: {get_dataset_status(dataset_status, \"replaced\"}\")" + "print(f\"Dataset is replaced: {get_dataset_status(dataset_status, 'replaced')}\")" ] }, { @@ -257,9 +281,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -271,7 +295,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.9.18" 
} }, "nbformat": 4, diff --git a/docs/source/tutorial_notebooks/production_schema.ipynb b/docs/source/tutorial_notebooks/production_schema.ipynb index 245ebd6b..25555438 100644 --- a/docs/source/tutorial_notebooks/production_schema.ipynb +++ b/docs/source/tutorial_notebooks/production_schema.ipynb @@ -25,7 +25,6 @@ "\n", "- Connect to the production schema and register a new dataset (admin only)\n", "- Query the production schema\n", - "- Transfer a dataset from another schema to the production schema (admin only)\n", "\n", "### Before we begin\n", "\n", @@ -43,8 +42,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "782179b4-4349-4199-b3a3-38d4845188a9", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if ran multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." 
] }, { @@ -71,7 +82,7 @@ "from dataregistry import DataRegistry\n", "\n", " "# Establish connection to the production schema\n", - "datareg = DataRegistry(schema=\"production\", owner=\"DESC CO group\", owner_type=\"production\")" + "datareg = DataRegistry(schema=\"tutorial_production\", owner=\"production\", owner_type=\"production\")" ] }, { "cell_type": "markdown", "id": "6f7423fb-32d0-4a33-8e87-cd75e952512f", "metadata": {}, "source": [ - "Here we have connected to the data registry production schema (`schema=\"production\"`). Notice we have assigned a universal owner (`owner=\"DESC CO group\"`) and owner type (`owner_type=\"production\"`) to save some time when registering the datasets during this instance.\n", - "\n", - "Note for the production schema no value other than `production` will be allowed for `owner_type` (the inverse is also true for any schema other than production)." + "Here we have connected to the data registry tutorial production schema (`schema=\"tutorial_production\"`). We have assigned the universal `owner` and `owner_type` to be \"production\", which are the only values allowed for the production schema." 
] }, { @@ -93,17 +102,12 @@ }, "outputs": [], "source": [ - "# Production datasets can't be overwritten, so for the purposes of this tutorial, let's generate a random unique name\n", - "import numpy as np\n", - "tag = np.rrandom.andint(0, 100000)\n", - "\n", "# Add new entry.\n", "dataset_id, execution_id = datareg.Registrar.dataset.register(\n", - " f\"nersc_production_tutorial/my_desc_production_dataset_{tag}\",\n", + " f\"nersc_production_tutorial:my_desc_production_dataset_{OWNER}\",\n", " \"1.0.0\",\n", " description=\"An production output from some DESC code\",\n", - " old_location=\"dummy_production_dataset.txt\",\n", - " is_dummy=True\n", + " location_type=\"dummy\"\n", ")\n", "\n", "print(f\"Created dataset {dataset_id}, associated with execution {execution_id}\")" @@ -120,7 +124,7 @@ "\n", "To recap about production datasets:\n", "- Only administrators have write access to the production schema and shared space\n", - "- All datasets in the production schema have `owner_type=\"production\"`\n", + "- All datasets in the production schema have `owner=\"production\"` and `owner_type=\"production\"`\n", "- Production datasets can never be overwritten, even if `is_overwritable=True`" ] }, @@ -147,8 +151,8 @@ }, "outputs": [], "source": [ - "# Create a filter that queries on the dataset name\n", - "f = datareg.Query.gen_filter('dataset.name', '==', 'my_desc_production_dataset')\n", + "# Create a filter that queries on the owner\n", + "f = datareg.Query.gen_filter('dataset.owner', '==', 'production')\n", "\n", "# Query the database\n", "results = datareg.Query.find_datasets(['dataset.dataset_id', 'dataset.name', 'dataset.owner',\n", @@ -166,31 +170,13 @@ "source": [ "Note that when using the command line interface to query datasets, e.g., `dregs ls`, both the default schema you are connected to and the production schema are both searched." 
] - }, - { - "cell_type": "markdown", - "id": "db6a2ac8-80ad-4038-a722-de9de8fbe433", - "metadata": {}, - "source": [ - "## Transferring datasets to the production schema\n", - "\n", - "TBD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb87beb4-937c-498c-b1f2-de32cab29b17", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -202,7 +188,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/source/tutorial_notebooks/query_datasets.ipynb b/docs/source/tutorial_notebooks/query_datasets.ipynb index 4c9558dd..29122609 100644 --- a/docs/source/tutorial_notebooks/query_datasets.ipynb +++ b/docs/source/tutorial_notebooks/query_datasets.ipynb @@ -9,7 +9,7 @@ " \n", "\n", "\n", - "# Getting started: Part 2 - Simple queries\n", + "# Getting started: Part 3 - Simple queries\n", "\n", "Here we continue our getting started tutorial, introducing queries.\n", "\n", @@ -19,7 +19,7 @@ "\n", "1) Perform a simple query with a single filter\n", "2) Perform a simple query with multiple filters\n", - "3) Query for all datasets with a keyword\n", + "3) Query for all datasets tagged with a given keyword\n", "\n", "### Before we begin\n", "\n", @@ -37,8 +37,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2140d287-56de-4e94-a12b-959e13e28a9c", + "metadata": 
{}, + "source": [ + "**Note** that running some of the cells below may fail, especially if ran multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." ] }, { @@ -64,8 +76,8 @@ "source": [ "from dataregistry import DataRegistry\n", "\n", - "# Establish connection to database (using defaults)\n", - "datareg = DataRegistry()" + "# Establish connection to the tutorial schema\n", + "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)" ] }, { @@ -77,7 +89,7 @@ "\n", "Queries are constructed from one or more boolean logic \"filters\", which translate to SQL `WHERE` clauses in the code. \n", "\n", - "For example, to create a filter that will query for all datasets in registry with the name \"my_desc_dataset\" would be as follows:" + "For example, to create a filter that will query for all datasets in registry with the name \"nersc_tutorial:my_first_desc_dataset\" would be as follows:" ] }, { @@ -90,7 +102,7 @@ "outputs": [], "source": [ "# Create a filter that queries on the dataset name\n", - "f = datareg.Query.gen_filter('dataset.name', '==', 'my_desc_dataset')" + "f = datareg.Query.gen_filter('dataset.name', '==', 'nersc_tutorial:my_first_desc_dataset')" ] }, { @@ -172,7 +184,9 @@ "cell_type": "code", "execution_count": null, "id": "54a52029-2908-4056-bc68-4a87f6c3e6df", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "print(datareg.Query.get_all_columns())" @@ -187,7 +201,7 @@ "\n", "We are not limited to using a single filter during queries.\n", "\n", - "Now let's say we want to return all datasets in the registry with a particular `owner`, that were registered after a certain date. 
We also want the results in a Pandas dataframe format.\n", + "Now let's say we want to return all datasets in the registry with a particular `owner_type`, that were registered after a certain date. We also want the results in a Pandas dataframe format.\n", "\n", "To do this we construct two filter objects, i.e.:" ] @@ -196,11 +210,13 @@ "cell_type": "code", "execution_count": null, "id": "8eec33d8-2139-473f-ab27-3a04ebd5e7f1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Create a filter that queries on the owner\n", - "f = datareg.Query.gen_filter('dataset.owner', '==', 'DESC')\n", + "f = datareg.Query.gen_filter('dataset.owner_type', '==', 'group')\n", "\n", "# Create a 2nd filter that queries on the entry date\n", "f2 = datareg.Query.gen_filter('dataset.creation_date', '>', '01-01-2024')" @@ -218,12 +234,14 @@ "cell_type": "code", "execution_count": null, "id": "d21e982a-5b86-4f75-8b54-7923dec11e04", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Query the database\n", "results = datareg.Query.find_datasets(['dataset.dataset_id', 'dataset.name', 'dataset.owner',\n", - " 'dataset.relative_path', 'dataset.creation_date'],\n", + " 'dataset.relative_path', 'dataset.creation_date', 'dataset.owner_type'],\n", " [f,f2],\n", " return_format=\"dataframe\")" ] @@ -240,7 +258,9 @@ "cell_type": "code", "execution_count": null, "id": "908aa870-c0a4-4e59-a11c-97185e4a3db1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "print(results)" @@ -262,7 +282,9 @@ "cell_type": "code", "execution_count": null, "id": "22310484-e8e5-41c3-8a52-c0f0bc3773ff", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Create a filter on a given keyword\n", @@ -272,15 +294,25 @@ "results = datareg.Query.find_datasets(['dataset.dataset_id', 'dataset.name', 'dataset.owner',\n", " 'dataset.relative_path', 'dataset.creation_date'],\n", " [f],\n", - " 
return_format=\"dataframe\")" + " return_format=\"dataframe\")\n", + "\n", + "print(results)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdbe8537-6195-4239-bbb8-976daacdfab7", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -292,7 +324,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/source/tutorial_notebooks/register_datasets.ipynb b/docs/source/tutorial_notebooks/register_datasets.ipynb index fca85a2c..a2de950b 100644 --- a/docs/source/tutorial_notebooks/register_datasets.ipynb +++ b/docs/source/tutorial_notebooks/register_datasets.ipynb @@ -25,7 +25,7 @@ "4) Replace a dataset\n", "5) Modify a previously registered dataset with updated metadata\n", "6) Delete a dataset\n", - "7) Registering external datasets\n", + "7) Register an \"external\" dataset\n", "8) Recap\n", "\n", "### Before we begin\n", @@ -44,8 +44,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d53aab85-bedf-47f6-a804-34a29e72631f", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if ran multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). 
In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." ] }, { @@ -72,7 +84,7 @@ "from dataregistry import DataRegistry\n", "\n", "# Establish connection to database (using defaults)\n", - "datareg = DataRegistry()" + "#datareg = DataRegistry() # This is commented out as we will make our connection to the tutorial schema below" ] }, { @@ -82,8 +94,8 @@ "source": [ "With no arguments, the `DataRegistry` class will automatically attempt to:\n", "- establish a connection to the registry database using the information in your `~/.config_reg_access` and `~/.pgpass` files\n", - "- connect to the default database schema\n", - "- use the default NERSC \"`site`\" for the `root_dir`\n", + "- connect to the default \"working\" database schema\n", + "- use the default NERSC `site` for the `root_dir`\n", "\n", "The root directory (`root_dir`) is the base path under which all ingested data will be copied. Other than for testing, this should generally be the NERSC `site` address." ] @@ -140,6 +152,26 @@ "# datareg = DataRegistry(owner=\"desc\", owner_type=\"group\")" ] }, + { + "cell_type": "markdown", + "id": "d22523db-ae92-4474-bb96-5ad98a404d61", + "metadata": {}, + "source": [ + "For these tutorials, there is a stand alone working (`tutorial_working`) and production (`tutorial_production`) tutorial schema that we will connect to as to not interupt the default DESC schemas with random entries. If you are practicing using the `dataregistry` outwith these tutorial notebooks, feel free to also use the tutorial schemas for your entries. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f994c54-1dec-4bcc-b773-3092f0ba40aa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)" + ] + }, { "cell_type": "markdown", "id": "2d723a37-4101-496c-b385-0a2644aa7ad8", @@ -154,7 +186,9 @@ "cell_type": "code", "execution_count": null, "id": "6797be3b-434f-4245-a276-a32d9294d1ca", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Make some temporary text files that we can practice ingesting into the dataregistry with\n", @@ -192,7 +226,6 @@ " \"nersc_tutorial:my_first_desc_dataset\", # `name`\n", " \"1.0.0\", # `version`\n", " description=\"An output from some DESC code\",\n", - " owner=\"DESC\",\n", " owner_type=\"group\",\n", " is_overwritable=True,\n", " old_location=temp_files[0].name\n", @@ -241,9 +274,9 @@ "\n", "Registering a dataset does two things; it creates an entry in the DESC data registry database with the appropriate metadata, and it (optionally) copies the dataset contents to the `root_dir`. \n", "\n", - "If the data are already at the correct relative path within the `root_dir`, leave `old_location=None` and set the `relative_path` option to point to the location of the data within the `root_dir` (see special cases section for more information on the `relative_path`). However it's likely for most users the data will need to be copied from another location to the `root_dir`. That initial location may be specified using the `old_location` parameter as we have done in the example above. \n", + "If the data are already at the correct relative path within the `root_dir`, leave `old_location=None` and set the `relative_path` option to point to the location of the data within the `root_dir` (see the next tutorial, \"A deeper look\", for more information on the `relative_path`). 
However it's likely for most users the data will need to be copied from another location to the `root_dir`. That initial location may be specified using the `old_location` parameter as we have done in the example above. \n", "\n", - "In our example we have created a dummy text file as our dataset and ingested it into the data registry, however this can be any file or directory (directories will be recursively copied).\n", + "In our example we have ingested one of the dummy text files into the data registry, however this can be any file or directory (directories will be recursively copied).\n", "\n", "Note that the dataregistry does not support registering datasets through symbolic links (symlinks).\n", "\n", @@ -261,14 +294,16 @@ "\n", "If you have a dataset that has been previously registered within the data registry, and that dataset has updates, it is simple to register the updated version.\n", "\n", - "Register the new dataset using the same process as before, making sure to keep the same dataset `name`, but updating the dataset version. One can update the version in two ways: manually entering a new version string, or having the dataregistry automatically \"bump\" the dataset version by selecing either \"major\", \"minor\" or \"patch\" for the version string. For example, let's register an updated version of our dataset, bumping the minor tag (i.e., bumping 1.0.0 -> 1.1.0)." + "Register the new dataset using the same process as before, making sure to keep the same dataset `name`, but updating the dataset `version` (and/or `version_suffix`). One can update the version in two ways: (1) manually entering a new version string, or (2) having the dataregistry automatically \"bump\" the dataset version by selecting either \"major\", \"minor\" or \"patch\" for the version string. For example, let's register an updated version of our dataset, bumping the minor tag (i.e., bumping 1.0.0 -> 1.1.0)." 
] }, { "cell_type": "code", "execution_count": null, "id": "2a65d3c0-41c1-4720-85be-10d68cef84f9", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new entry for an updated dataset with an updated version.\n", "dataset_id, execution_id = datareg.Registrar.dataset.register(\n", " \"nersc_tutorial:my_first_desc_dataset\",\n", " \"minor\", # Automatically bumps to \"1.1.0\"\n", " description=\"An output from some DESC code (updated)\",\n", - " owner=\"DESC\",\n", " owner_type=\"group\",\n", " is_overwritable=True,\n", " old_location=temp_files[1].name,\n", @@ -317,7 +351,9 @@ "cell_type": "code", "execution_count": null, "id": "8aa09c52-8283-4f91-bb62-490e65acbb4d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new entry, overwriting the data in the `root_dir`.\n", "dataset_id, execution_id = datareg.Registrar.dataset.register(\n", " \"nersc_tutorial:my_first_desc_dataset\",\n", " \"1.0.0\", \n", " description=\"An output from some DESC code (further updated)\",\n", - " owner=\"DESC\",\n", " owner_type=\"group\",\n", " is_overwritable=True,\n", " old_location=temp_files[2].name,\n", @@ -343,7 +378,7 @@ "id": "19afb001-d82f-448e-b40d-e4249e534286", "metadata": {}, "source": [ - "Only `valid` datasets with `is_overwritable=True` set or `invalid` datasets can be replaced. Deleted datasets, or archived datasets, cannot be replaced (see next tutorial for information about a datasets `status`)." + "Only `valid` datasets with `is_overwritable=True` set, or `invalid` datasets can be replaced (invalid datasets are those that failed to register, most likely due to a copying error or interruption). Deleted datasets, or archived datasets, cannot be replaced (see next tutorial for information about a dataset's `status`)." 
] }, { @@ -362,7 +397,9 @@ "cell_type": "code", "execution_count": null, "id": "4252ce7e-7dea-4404-ae61-19b8ca0be2fd", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# What columns in the dataset table are modifiable?\n", @@ -383,7 +420,9 @@ "cell_type": "code", "execution_count": null, "id": "5f20548e-148b-44af-a22a-b1e259d5e994", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# A key-value dict of the columns we want to update, with their new values\n", @@ -399,14 +438,16 @@ "source": [ "## 6) Deleting a dataset in the dataregistry\n", "\n", - "To delete a dataset entry from the dataregistry we call the .delete() function which accepts one argument, the dataset_id of the entry you wish to delete, e.g.," + "To delete a dataset entry from the dataregistry we call the .delete() function which accepts one argument, the `dataset_id` of the entry you wish to delete, e.g.," ] }, { "cell_type": "code", "execution_count": null, "id": "becbc567-e7a0-479d-8a1d-4a315d781bd3", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Delete dataset with entry ID == dataset_id\n", @@ -418,7 +459,7 @@ "id": "594c3b1a-8491-4f6e-b47b-97de00cccd9b", "metadata": {}, "source": [ - "Note that this will remove the dataset data stored under the root_dir, however the entry within the registry database will remain (with an updated status indicated the dataset was deleted)." + "Note that this will remove the files and/or directories associated with the dataset under the `root_dir`, however the entry within the registry database will remain (with an updated `status` bit indicating the dataset was deleted)." 
] }, { @@ -426,7 +467,7 @@ "id": "8f6909cf-9826-4533-8b91-bb7a63079b37", "metadata": {}, "source": [ - "## 7) Registering external datasets\n", + "## 7) Registering \"external\" datasets\n", "\n", "Typically when we register datasets we are asking the `dataregistry` to collate provenance data for the dataset and to physically manage the data (either copy the data to the central `root_dir` or verify that it already exists there).\n", "\n", @@ -441,7 +482,9 @@ "cell_type": "code", "execution_count": null, "id": "e959f76f-b5e1-43a6-9932-ee1866cc398d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new external dataset entry.\n", @@ -486,9 +529,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -500,7 +543,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.9.18" } }, "nbformat": 4,