From a1478206434c98d2aa468a8da96958768192e124 Mon Sep 17 00:00:00 2001 From: James Wood Date: Fri, 26 Apr 2024 11:15:52 -0700 Subject: [PATCH] Feature/setup tests (#711) * Add C1247485682-LARC_CLOUD * remove ids * add temp limit * update cleanup * remove temp limit --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: jamesfwood Co-authored-by: James Wood --- .github/workflows/diff.yml | 25 +- .gitignore | 2 +- README.md | 8 +- .../__init__.py | 0 .../cmr_association_diff.py | 0 pyproject.toml | 8 +- tests/cmr/concise/ops/C1940473819-POCLOUD | 1 + tests/cmr/concise/uat/C1234724470-POCLOUD | 1 + tests/collection_names.py | 2 +- tests/conftest.py | 2 +- tests/remove_prs.py | 2 +- tests/verify_collection.py | 264 +++++++++--------- 12 files changed, 163 insertions(+), 152 deletions(-) rename {l2ss_py_autotest => concise_autotest}/__init__.py (100%) rename {l2ss_py_autotest => concise_autotest}/cmr_association_diff.py (100%) create mode 100644 tests/cmr/concise/ops/C1940473819-POCLOUD create mode 100644 tests/cmr/concise/uat/C1234724470-POCLOUD diff --git a/.github/workflows/diff.yml b/.github/workflows/diff.yml index 2eee0048..e3950dd5 100644 --- a/.github/workflows/diff.yml +++ b/.github/workflows/diff.yml @@ -1,5 +1,5 @@ # Run every third day starting from the 2nd of the month 1 am pacific -# check for new collection associations to the l2ss-py UMM-S record +# check for new collection associations to the concise UMM-S record # in UAT and OPS. If a new association is found, a PR is opened to add the new collection concept id to the # cmr/concise/*_associations.txt file. @@ -33,21 +33,17 @@ jobs: run: | poetry install - ls $GITHUB_WORKSPACE/tests/cmr/concise/uat/ > $GITHUB_WORKSPACE/tests/cmr/concise/uat_associations.txt - ls $GITHUB_WORKSPACE/tests/cmr/concise/ops/ > $GITHUB_WORKSPACE/tests/cmr/concise/ops_associations.txt + ls $GITHUB_WORKSPACE/tests/cmr/concise/uat/ > uat_associations.txt + ls $GITHUB_WORKSPACE/tests/cmr/concise/ops/ > ops_associations.txt - poetry run cmr_association_diff -e uat -t service -p POCLOUD -n 'PODAAC Concise' -a $GITHUB_WORKSPACE/tests/cmr/concise/uat_associations.txt --token $UAT_TOKEN_TEMP > $GITHUB_WORKSPACE/tests/cmr/concise/new_uat_associations.txt - poetry run cmr_association_diff -e ops -t service -p POCLOUD -n 'PODAAC Concise' -a $GITHUB_WORKSPACE/tests/cmr/concise/ops_associations.txt --token $OPS_TOKEN_TEMP > $GITHUB_WORKSPACE/tests/cmr/concise/new_ops_associations.txt + poetry run cmr_association_diff -e uat -t service -p POCLOUD -n 'PODAAC Concise' -a uat_associations.txt --token $UAT_TOKEN_TEMP > new_uat_associations.txt + poetry run cmr_association_diff -e ops -t service -p POCLOUD -n 'PODAAC Concise' -a ops_associations.txt --token $OPS_TOKEN_TEMP > new_ops_associations.txt - echo "new_uat_associations=$(poetry run python tests/collection_names.py --env uat --token $UAT_TOKEN_TEMP --file $GITHUB_WORKSPACE/tests/cmr/concise/new_uat_associations.txt)" >> $GITHUB_OUTPUT - echo "new_ops_associations=$(poetry run python tests/collection_names.py --env ops --token $OPS_TOKEN_TEMP --file $GITHUB_WORKSPACE/tests/cmr/concise/new_ops_associations.txt)" >> $GITHUB_OUTPUT + echo "new_uat_associations=$(poetry run python tests/collection_names.py --env uat --token $UAT_TOKEN_TEMP --file new_uat_associations.txt)" >> $GITHUB_OUTPUT + echo "new_ops_associations=$(poetry run python tests/collection_names.py --env ops --token $OPS_TOKEN_TEMP --file new_ops_associations.txt)" >> $GITHUB_OUTPUT - rm 
$GITHUB_WORKSPACE/tests/cmr/concise/uat_associations.txt - rm $GITHUB_WORKSPACE/tests/cmr/concise/ops_associations.txt - rm $GITHUB_WORKSPACE/tests/cmr/concise/new_uat_associations.txt - rm $GITHUB_WORKSPACE/tests/cmr/concise/new_ops_associations.txt + rm *_associations.txt open_pr_uat: - if: false needs: find_new strategy: fail-fast: false @@ -80,7 +76,7 @@ jobs: delete-branch: true title: UAT ${{ matrix.data.concept_id }} (${{ matrix.data.short_name }}) body: | - New association between l2ss-py and ${{ matrix.data.concept_id }} found in UAT. + New association between concise and ${{ matrix.data.concept_id }} found in UAT. Beginning verification of collection. labels: | unverified @@ -92,7 +88,6 @@ jobs: open_pr_ops: - if: false needs: find_new strategy: fail-fast: false @@ -125,7 +120,7 @@ jobs: delete-branch: true title: OPS ${{ matrix.data.concept_id }} (${{ matrix.data.short_name }}) body: | - New association between l2ss-py and ${{ matrix.data.concept_id }} found in OPS. + New association between concise and ${{ matrix.data.concept_id }} found in OPS. Beginning verification of collection. labels: | unverified diff --git a/.gitignore b/.gitignore index 269cc332..7472af5b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ /.idea/inspectionProfiles/Project_Default.xml /.idea/.gitignore /.idea/encodings.xml -/.idea/l2ss-py-autotest.iml +/.idea/concise-autotest.iml /.idea/misc.xml /.idea/modules.xml /.idea/vcs.xml diff --git a/README.md b/README.md index ed53f28e..71e0d905 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# l2ss-py-autotest +# concise-autotest -This repository contains functional/integration tests for l2ss-py. It also includes github +This repository contains functional/integration tests for concise. It also includes github action workflows for automatically running these tests whenever a new collection gets -associated to the l2ss-py UMM-S record. +associated to the concise UMM-S record. ## How it works @@ -17,7 +17,7 @@ associated to the l2ss-py UMM-S record. ## What to do if tests fail If a test fails, meaning an assertion did not succeed, or an unknown error occurs action must be taken. The cause of the failure should be determined and fixed. -A failing test generally indicates an issue with either metadata or l2ss-py itself and may require additional steps. +A failing test generally indicates an issue with either metadata or concise itself and may require additional steps. In some cases, the test may need to be updated to account for a unique edge case. 
## What to do if tests are skipped diff --git a/l2ss_py_autotest/__init__.py b/concise_autotest/__init__.py similarity index 100% rename from l2ss_py_autotest/__init__.py rename to concise_autotest/__init__.py diff --git a/l2ss_py_autotest/cmr_association_diff.py b/concise_autotest/cmr_association_diff.py similarity index 100% rename from l2ss_py_autotest/cmr_association_diff.py rename to concise_autotest/cmr_association_diff.py diff --git a/pyproject.toml b/pyproject.toml index ccf7027a..158f6a69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [tool.poetry] -name = "l2ss-py-autotest" +name = "concise-autotest" version = "0.1.0" -description = "Automated tests for new associations to l2ss-py service" +description = "Automated tests for new associations to concise service" authors = ["PO.DAAC "] license = "Apache 2.0" readme = "README.md" -packages = [{include = "l2ss_py_autotest"}] +packages = [{include = "concise_autotest"}] [tool.poetry.dependencies] python = "^3.9" @@ -34,4 +34,4 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -cmr_association_diff = "l2ss_py_autotest.cmr_association_diff:run" +cmr_association_diff = "concise_autotest.cmr_association_diff:run" diff --git a/tests/cmr/concise/ops/C1940473819-POCLOUD b/tests/cmr/concise/ops/C1940473819-POCLOUD new file mode 100644 index 00000000..18fa0802 --- /dev/null +++ b/tests/cmr/concise/ops/C1940473819-POCLOUD @@ -0,0 +1 @@ +C1940473819-POCLOUD diff --git a/tests/cmr/concise/uat/C1234724470-POCLOUD b/tests/cmr/concise/uat/C1234724470-POCLOUD new file mode 100644 index 00000000..9391ada0 --- /dev/null +++ b/tests/cmr/concise/uat/C1234724470-POCLOUD @@ -0,0 +1 @@ +C1234724470-POCLOUD diff --git a/tests/collection_names.py b/tests/collection_names.py index 6edd7d47..6070d999 100644 --- a/tests/collection_names.py +++ b/tests/collection_names.py @@ -7,7 +7,7 @@ def main(): parser = argparse.ArgumentParser(description="Get Collection Names from CMR") parser.add_argument("--token", help="launchpad token") - parser.add_argument("--file", help="file with list of l2ss associations") + parser.add_argument("--file", help="file with list of concise associations") parser.add_argument("--env", help="CMR environment") args = parser.parse_args() diff --git a/tests/conftest.py b/tests/conftest.py index a39e5560..5755a4e9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,7 @@ def pytest_addoption(parser): def pytest_generate_tests(metafunc): if metafunc.config.option.regression: - cmr_dirpath = pathlib.Path('cmr') + cmr_dirpath = pathlib.Path('cmr/concise') association_dir = 'uat' if metafunc.config.option.env == 'uat' else 'ops' associations = os.listdir(cmr_dirpath.joinpath(association_dir)) diff --git a/tests/remove_prs.py b/tests/remove_prs.py index f87cdb3b..ac33f560 100644 --- a/tests/remove_prs.py +++ b/tests/remove_prs.py @@ -26,7 +26,7 @@ # Replace these variables with your own values repo_owner = 'podaac' - repo_name = 'l2ss-py-autotest' + repo_name = 'concise-autotest' github_token = os.getenv("GITHUB_TOKEN") # Initialize a Github instance diff --git a/tests/verify_collection.py b/tests/verify_collection.py index a4feecd6..18a28007 100644 --- a/tests/verify_collection.py +++ b/tests/verify_collection.py @@ -10,6 +10,7 @@ import netCDF4 import numpy as np import podaac.subsetter.subset +import unittest import pytest import requests import xarray @@ -336,142 +337,155 @@ def get_lat_lon_var_names(dataset: xarray.Dataset, file_to_subset: str, collecti 
pytest.fail(f"Unable to find latitude and longitude variables.") -@pytest.mark.timeout(600) -def test_spatial_subset(collection_concept_id, env, granule_json, collection_variables, - harmony_env, tmp_path: pathlib.Path, bearer_token): - test_spatial_subset.__doc__ = f"Verify spatial subset for {collection_concept_id} in {env}" - - logging.info("Using granule %s for test", granule_json['meta']['concept-id']) +def verify_dims(merged_group, origin_group, both_merged): + for dim in origin_group.dimensions: + if both_merged: + unittest.TestCase().assertEqual(merged_group.dimensions[dim].size, origin_group.dimensions[dim].size) + else: + unittest.TestCase().assertGreaterEqual(merged_group.dimensions[dim].size, origin_group.dimensions[dim].size) - # Compute a box that is smaller than the granule extent bounding box - north, south, east, west = get_bounding_box(granule_json) - east, west, north, south = create_smaller_bounding_box(east, west, north, south, .95) - # Build harmony request - harmony_client = harmony.Client(env=harmony_env, token=bearer_token) - request_bbox = harmony.BBox(w=west, s=south, e=east, n=north) - request_collection = harmony.Collection(id=collection_concept_id) - harmony_request = harmony.Request(collection=request_collection, spatial=request_bbox, - granule_id=[granule_json['meta']['concept-id']]) - logging.info("Sending harmony request %s", harmony_client.request_as_curl(harmony_request)) - - # Submit harmony request and download result - job_id = harmony_client.submit(harmony_request) - logging.info("Submitted harmony job %s", job_id) - harmony_client.wait_for_processing(job_id, show_progress=True) - subsetted_filepath = None - for filename in [file_future.result() - for file_future - in harmony_client.download_all(job_id, directory=f'{tmp_path}', overwrite=True)]: - logging.info(f'Downloaded: %s', filename) - subsetted_filepath = pathlib.Path(filename) - - # Verify spatial subset worked - subsetted_ds = xarray.open_dataset(subsetted_filepath, decode_times=False) - group = None - # Try to read group in file - lat_var_name, lon_var_name = get_lat_lon_var_names(subsetted_ds, subsetted_filepath, collection_variables) - - with netCDF4.Dataset(subsetted_filepath) as f: - group_list = [] - def group_walk(groups, nc_d, current_group): - global subsetted_ds_new - subsetted_ds_new = None - # check if the top group has lat or lon variable - if lat_var_name in list(nc_d.variables.keys()): - subsetted_ds_new = subsetted_ds - else: - # if not then we'll need to keep track of the group layers - group_list.append(current_group) - - # loop through the groups in the current layer - for g in groups: - # end the loop if we've already found latitude - if subsetted_ds_new: - break - # check if the groups have latitude, define the dataset and end the loop if found - if lat_var_name in list(nc_d.groups[g].variables.keys()): - group_list.append(g) - g = '/'.join(group_list) - subsetted_ds_new = xarray.open_dataset(subsetted_filepath, group=g, decode_times=False) - break - # recall the function on a group that has groups in it and didn't find latitude - # this is going 'deeper' into the groups - if len(list(nc_d.groups[g].groups.keys())) > 0: - group_walk(nc_d.groups[g].groups, nc_d.groups[g], g) - else: - continue - - group_walk(f.groups, f, '') - - assert lat_var_name and lon_var_name - - var_ds = None - msk = None - - if science_vars := get_science_vars(collection_variables): - for idx, value in enumerate(science_vars): - science_var_name = science_vars[idx]['umm']['Name'] - try: - var_ds = 
subsetted_ds_new[science_var_name] - msk = np.logical_not(np.isnan(var_ds.data.squeeze())) - break - except Exception: - try: - # if the variable couldn't be found because the name includes a group, e.g., - # `geolocation/relative_azimuth_angle`, - # then try to access the variable after removing the group name. - var_ds = subsetted_ds_new[science_var_name.rsplit("/", 1)[-1]] - msk = np.logical_not(np.isnan(var_ds.data.squeeze())) - break - except Exception: - var_ds = None - msk = None - - if var_ds is None and msk is None: - pytest.fail(f"Unable to find variable from umm-v to use as science variable.") +def verify_attrs(merged_obj, origin_obj, both_merged): + ignore_attributes = [ + 'request-bounding-box', 'request-bounding-box-description', 'PODAAC-dataset-shortname', + 'PODAAC-persistent-ID', 'time_coverage_end', 'time_coverage_start' + ] - else: - # Can't find a science var in UMM-V, just pick one + merged_attrs = merged_obj.ncattrs() + origin_attrs = origin_obj.ncattrs() - science_var_name = next(iter([v for v in subsetted_ds_new.variables if - str(v) not in lat_var_name and str(v) not in lon_var_name and 'time' not in str(v)])) + for attr in origin_attrs: + if attr in ignore_attributes: + # Skip attributes which are present in the Java implementation, + # but not (currently) present in the Python implementation + continue - var_ds = subsetted_ds_new[science_var_name] + if not both_merged and attr not in merged_attrs: + # Skip attributes which are not present in both merged and origin. + # This is normal operation as some attributes may be omited b/c + # they're inconsistent between granules + continue - try: - msk = np.logical_not(np.isnan(var_ds.data.squeeze())) - llat = subsetted_ds_new[lat_var_name].where(msk) - llon = subsetted_ds_new[lon_var_name].where(msk) - except ValueError: - - llat = subsetted_ds_new[lat_var_name] - llon = subsetted_ds_new[lon_var_name] + merged_attr = merged_obj.getncattr(attr) + if both_merged and isinstance(merged_attr, int): + # Skip integer values - the Java implementation seems to omit + # these values due to its internal handling of all values as + # Strings + continue - lat_max = llat.max() - lat_min = llat.min() + origin_attr = origin_obj.getncattr(attr) + if isinstance(origin_attr, np.ndarray): + unittest.TestCase().assertTrue(np.array_equal(merged_attr, origin_attr)) + else: + if attr != "history_json": + unittest.TestCase().assertEqual(merged_attr, origin_attr) - lon_min = llon.min() - lon_max = llon.max() - lon_min = (lon_min + 180) % 360 - 180 - lon_max = (lon_max + 180) % 360 - 180 +def verify_variables(merged_group, origin_group, subset_index, both_merged): + for var in origin_group.variables: + merged_var = merged_group.variables[var] + origin_var = origin_group.variables[var] - lat_var_fill_value = subsetted_ds_new[lat_var_name].encoding.get('_FillValue') - lon_var_fill_value = subsetted_ds_new[lon_var_name].encoding.get('_FillValue') + verify_attrs(merged_var, origin_var, both_merged) - if lat_var_fill_value: - if (lat_max <= north or np.isclose(lat_max, north)) and (lat_min >= south or np.isclose(lat_min, south)): - logging.info("Successful Latitude subsetting") - elif np.isnan(lat_max) and np.isnan(lat_min): - logging.info("Partial Lat Success - no Data") + if both_merged: + # both groups require subset indexes + merged_data = merged_var[subset_index[0]] + origin_data = origin_var[subset_index[1]] else: - assert False + # merged group requires a subset index + merged_data = np.resize(merged_var[subset_index], origin_var.shape) + 
origin_data = origin_var + + equal_nan = True + if merged_data.dtype.kind == 'S': + equal_nan = False - if lon_var_fill_value: - if (lon_max <= east or np.isclose(lon_max, east)) and (lon_min >= west or np.isclose(lon_min, west)): - logging.info("Successful Longitude subsetting") - elif np.isnan(lon_max) and np.isnan(lon_min): - logging.info("Partial Lon Success - no Data") + # verify variable data + if isinstance(origin_data, str): + unittest.TestCase().assertEqual(merged_data, origin_data) else: - assert False + unittest.TestCase().assertTrue(np.array_equal(merged_data, origin_data, equal_nan=equal_nan)) + + +def verify_groups(merged_group, origin_group, subset_index, file=None, both_merged=False): + if file: + print("verifying groups ....." + file) + + verify_dims(merged_group, origin_group, both_merged) + verify_attrs(merged_group, origin_group, both_merged) + verify_variables(merged_group, origin_group, subset_index, both_merged) + + for child_group in origin_group.groups: + merged_subgroup = merged_group[child_group] + origin_subgroup = origin_group[child_group] + verify_groups(merged_subgroup, origin_subgroup, subset_index, both_merged=both_merged) + + +def download_file(url, local_path, headers): + response = requests.get(url, stream=True, headers=headers) + if response.status_code == 200: + with open(local_path, 'wb') as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + print("Original File downloaded successfully. " + local_path) + else: + print(f"Failed to download the file. Status code: {response.status_code}") + + +@pytest.mark.timeout(600) +def test_concatenate(collection_concept_id, harmony_env, bearer_token): + + max_results = 2 + + harmony_client = harmony.Client(env=harmony_env, token=bearer_token) + collection = harmony.Collection(id=collection_concept_id) + + request = harmony.Request( + collection=collection, + concatenate=True, + max_results=max_results, + skip_preview=True, + format="application/x-netcdf4", + ) + + request.is_valid() + + print(harmony_client.request_as_curl(request)) + + job1_id = harmony_client.submit(request) + + print(f'\n{job1_id}') + + print(harmony_client.status(job1_id)) + + print('\nWaiting for the job to finish') + + results = harmony_client.result_json(job1_id) + + print('\nDownloading results:') + + futures = harmony_client.download_all(job1_id) + file_names = [f.result() for f in futures] + print('\nDone downloading.') + + filename = file_names[0] + + # Handle time dimension and variables dropping + merge_dataset = netCDF4.Dataset(filename, 'r') + + headers = { + "Authorization": f"Bearer {bearer_token}" + } + + original_files = merge_dataset.variables['subset_files'] + history_json = json.loads(merge_dataset.history_json) + assert len(original_files) == max_results + + for url in history_json[0].get("derived_from"): + local_file_name = os.path.basename(url) + download_file(url, local_file_name, headers) + + for i, file in enumerate(original_files): + origin_dataset = netCDF4.Dataset(file) + verify_groups(merge_dataset, origin_dataset, i, file=file)
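
A minimal sketch of reproducing the new test_concatenate check locally, under stated assumptions: the --env and --regression option names are inferred from the pytest_addoption/pytest_generate_tests code in tests/conftest.py, the bearer_token fixture is assumed to obtain Earthdata Login credentials from the local environment, and because conftest.py resolves pathlib.Path('cmr/concise') against the working directory, the invocation is assumed to run from the tests/ directory. This is illustrative only, not part of the patch.

# run_concatenate_locally.py -- illustrative sketch, not part of this patch
import pytest

# Collects test_concatenate once per collection concept id listed under
# cmr/concise/uat/ (switch "uat" to "ops" for the production environment).
exit_code = pytest.main([
    "verify_collection.py::test_concatenate",
    "--env", "uat",      # assumed option name from conftest.py (option.env)
    "--regression",      # assumed flag name from conftest.py (option.regression)
])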