diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 90354f74..97f874fa 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -33,7 +33,6 @@ def time_test_cone_filter_multiple_order(): "catalog_name": "test_name", "catalog_type": "object", "total_rows": 10, - "epoch": "J2000", "ra_column": "ra", "dec_column": "dec", } diff --git a/src/hats/catalog/__init__.py b/src/hats/catalog/__init__.py index d23593a2..ee69f91f 100644 --- a/src/hats/catalog/__init__.py +++ b/src/hats/catalog/__init__.py @@ -1,6 +1,7 @@ """Catalog data wrappers""" from .association_catalog import AssociationCatalog +from .association_catalog.partition_join_info import PartitionJoinInfo from .catalog import Catalog from .catalog_type import CatalogType from .dataset.dataset import Dataset diff --git a/src/hats/catalog/dataset/table_properties.py b/src/hats/catalog/dataset/table_properties.py index 1ae322a1..17243d30 100644 --- a/src/hats/catalog/dataset/table_properties.py +++ b/src/hats/catalog/dataset/table_properties.py @@ -12,8 +12,8 @@ ## catalog_name, catalog_type, and total_rows are allowed for ALL types CATALOG_TYPE_ALLOWED_FIELDS = { - CatalogType.OBJECT: ["ra_column", "dec_column"], - CatalogType.SOURCE: ["primary_catalog", "ra_column", "dec_column"], + CatalogType.OBJECT: ["ra_column", "dec_column", "default_columns"], + CatalogType.SOURCE: ["primary_catalog", "ra_column", "dec_column", "default_columns"], CatalogType.ASSOCIATION: [ "primary_catalog", "primary_column", @@ -24,7 +24,7 @@ "contains_leaf_files", ], CatalogType.INDEX: ["primary_catalog", "indexing_column", "extra_columns"], - CatalogType.MARGIN: ["primary_catalog", "margin_threshold", "ra_column", "dec_column"], + CatalogType.MARGIN: ["primary_catalog", "margin_threshold", "ra_column", "dec_column", "default_columns"], } ## catalog_name, catalog_type, and total_rows are required for ALL types @@ -42,6 +42,41 @@ CatalogType.MARGIN: ["primary_catalog", "margin_threshold"], } +# All additional properties in the HATS recommendation. +EXTRA_ALLOWED_FIELDS = [ + "addendum_did", + "bib_reference", + "bib_reference_url", + "creator_did", + "data_ucd", + "hats_builder", + "hats_cols_sort", + "hats_cols_survey_id", + "hats_copyright", + "hats_creation_date", + "hats_creator", + "hats_estsize", + "hats_frame", + "hats_max_rows", + "hats_order", + "hats_progenitor_url", + "hats_release_date", + "hats_service_url", + "hats_status", + "hats_version", + "moc_sky_fraction", + "obs_ack", + "obs_copyright", + "obs_copyright_url", + "obs_description", + "obs_regime", + "obs_title", + "prov_progenitor", + "publisher_id", + "t_max", + "t_min", +] + class TableProperties(BaseModel): """Container class for catalog metadata""" @@ -52,6 +87,8 @@ class TableProperties(BaseModel): ra_column: Optional[str] = Field(default=None, alias="hats_col_j2000_ra") dec_column: Optional[str] = Field(default=None, alias="hats_col_j2000_dec") + default_columns: Optional[List[str]] = Field(default=None, alias="hats_cols_default") + """Which columns should be read from parquet files, when user doesn't otherwise specify.""" primary_catalog: Optional[str] = Field(default=None, alias="hats_primary_table_url") """Reference to object catalog. Relevant for nested, margin, association, and index.""" @@ -86,7 +123,7 @@ class TableProperties(BaseModel): ## Allow any extra keyword args to be stored on the properties object. 
model_config = ConfigDict(extra="allow", populate_by_name=True, use_enum_values=True) - @field_validator("extra_columns", mode="before") + @field_validator("default_columns", "extra_columns", mode="before") @classmethod def space_delimited_list(cls, str_value: str) -> List[str]: """Convert a space-delimited list string into a python list of strings.""" @@ -95,9 +132,11 @@ def space_delimited_list(cls, str_value: str) -> List[str]: return list(filter(None, re.split(";| |,|\n", str_value))) return str_value - @field_serializer("extra_columns") + @field_serializer("default_columns", "extra_columns") def serialize_as_space_delimited_list(self, str_list: Iterable[str]) -> str: """Convert a python list of strings into a space-delimited string.""" + if str_list is None: + return None return " ".join(str_list) @model_validator(mode="after") @@ -122,8 +161,18 @@ def check_allowed_and_required(self) -> Self: raise ValueError( f"Missing required property for table type {self.catalog_type}: {missing_required}" ) + + # Check against all known properties - catches typos. + non_allowed = set(self.__pydantic_extra__.keys()) - set(EXTRA_ALLOWED_FIELDS) + if len(non_allowed) > 0: + raise ValueError(f"Unexpected extra property: {non_allowed}") return self + def copy_and_update(self, **kwargs): + new_properties = self.model_copy(update=kwargs) + TableProperties.model_validate(new_properties) + return new_properties + def explicit_dict(self): """Create a dict, based on fields that have been explicitly set, and are not "extra" keys.""" explicit = self.model_dump(by_alias=False, exclude_none=True) diff --git a/src/hats/catalog/partition_info.py b/src/hats/catalog/partition_info.py index 7b936567..a3c05ecf 100644 --- a/src/hats/catalog/partition_info.py +++ b/src/hats/catalog/partition_info.py @@ -11,6 +11,7 @@ import pyarrow as pa from upath import UPath +import hats.pixel_math.healpix_shim as hp from hats.io import file_io, paths from hats.io.parquet_metadata import ( read_row_group_fragments, @@ -24,7 +25,6 @@ class PartitionInfo: """Container class for per-partition info.""" METADATA_ORDER_COLUMN_NAME = "Norder" - METADATA_DIR_COLUMN_NAME = "Dir" METADATA_PIXEL_COLUMN_NAME = "Npix" def __init__(self, pixel_list: List[HealpixPixel], catalog_base_dir: str = None) -> None: @@ -98,10 +98,9 @@ def write_to_metadata_files(self, catalog_path: str | Path | UPath | None = None batches = [ [ pa.RecordBatch.from_arrays( - [[pixel.order], [pixel.dir], [pixel.pixel]], + [[pixel.order], [pixel.pixel]], names=[ self.METADATA_ORDER_COLUMN_NAME, - self.METADATA_DIR_COLUMN_NAME, self.METADATA_PIXEL_COLUMN_NAME, ], ) @@ -254,12 +253,10 @@ def as_dataframe(self): partition_info_dict = { PartitionInfo.METADATA_ORDER_COLUMN_NAME: [], PartitionInfo.METADATA_PIXEL_COLUMN_NAME: [], - PartitionInfo.METADATA_DIR_COLUMN_NAME: [], } for pixel in self.pixel_list: partition_info_dict[PartitionInfo.METADATA_ORDER_COLUMN_NAME].append(pixel.order) partition_info_dict[PartitionInfo.METADATA_PIXEL_COLUMN_NAME].append(pixel.pixel) - partition_info_dict[PartitionInfo.METADATA_DIR_COLUMN_NAME].append(pixel.dir) return pd.DataFrame.from_dict(partition_info_dict) @classmethod @@ -272,3 +269,12 @@ def from_healpix(cls, healpix_pixels: List[HealpixPixel]) -> PartitionInfo: A `PartitionInfo` object with the same healpix pixels """ return cls(healpix_pixels) + + def calculate_fractional_coverage(self): + """Calculate what fraction of the sky is covered by partition tiles.""" + pixel_orders = [p.order for p in self.pixel_list] + cov_order, cov_count = 
np.unique(pixel_orders, return_counts=True) + area_by_order = [hp.nside2pixarea(hp.order2nside(order), degrees=True) for order in cov_order] + # 41253 is the number of square degrees in a sphere + # https://en.wikipedia.org/wiki/Square_degree + return (area_by_order * cov_count).sum() / 41253 diff --git a/src/hats/io/__init__.py b/src/hats/io/__init__.py index 4d245881..c52c3f42 100644 --- a/src/hats/io/__init__.py +++ b/src/hats/io/__init__.py @@ -15,4 +15,3 @@ pixel_catalog_file, pixel_directory, ) -from .write_metadata import write_partition_info diff --git a/src/hats/io/paths.py b/src/hats/io/paths.py index d0a8de7b..0076efd8 100644 --- a/src/hats/io/paths.py +++ b/src/hats/io/paths.py @@ -13,12 +13,13 @@ from hats.io.file_io.file_pointer import get_upath from hats.pixel_math.healpix_pixel import INVALID_PIXEL, HealpixPixel -ORDER_DIRECTORY_PREFIX = "Norder" -DIR_DIRECTORY_PREFIX = "Dir" -PIXEL_DIRECTORY_PREFIX = "Npix" -JOIN_ORDER_DIRECTORY_PREFIX = "join_Norder" -JOIN_DIR_DIRECTORY_PREFIX = "join_Dir" -JOIN_PIXEL_DIRECTORY_PREFIX = "join_Npix" +PARTITION_ORDER = "Norder" +PARTITION_DIR = "Dir" +PARTITION_PIXEL = "Npix" + +MARGIN_ORDER = "margin_Norder" +MARGIN_DIR = "margin_Dir" +MARGIN_PIXEL = "margin_Npix" PARTITION_INFO_FILENAME = "partition_info.csv" PARTITION_JOIN_INFO_FILENAME = "partition_join_info.csv" @@ -62,7 +63,7 @@ def pixel_directory( raise ValueError("One of pixel_number or directory_number is required to create pixel directory") return create_hive_directory_name( catalog_base_dir, - [ORDER_DIRECTORY_PREFIX, DIR_DIRECTORY_PREFIX], + [PARTITION_ORDER, PARTITION_DIR], [norder, ndir], ) @@ -127,9 +128,9 @@ def pixel_catalog_files( base_path + fs.sep.join( [ - f"{ORDER_DIRECTORY_PREFIX}={pixel.order}", - f"{DIR_DIRECTORY_PREFIX}={pixel.dir}", - f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet" + url_params, + f"{PARTITION_ORDER}={pixel.order}", + f"{PARTITION_DIR}={pixel.dir}", + f"{PARTITION_PIXEL}={pixel.pixel}.parquet" + url_params, ] ) for pixel in pixels @@ -193,9 +194,9 @@ def pixel_catalog_file( return ( catalog_base_dir - / f"{ORDER_DIRECTORY_PREFIX}={pixel.order}" - / f"{DIR_DIRECTORY_PREFIX}={pixel.dir}" - / f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet{url_params}" + / f"{PARTITION_ORDER}={pixel.order}" + / f"{PARTITION_DIR}={pixel.dir}" + / f"{PARTITION_PIXEL}={pixel.pixel}.parquet{url_params}" ) diff --git a/src/hats/io/write_metadata.py b/src/hats/io/write_metadata.py deleted file mode 100644 index 45cd76ab..00000000 --- a/src/hats/io/write_metadata.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Utility functions for writing metadata files""" - -from __future__ import annotations - -from pathlib import Path - -import numpy as np -import pandas as pd -from upath import UPath - -from hats.io import file_io, paths - - -def write_partition_info(catalog_base_dir: str | Path | UPath, destination_healpix_pixel_map: dict): - """Write all partition data to CSV file. - - Args: - catalog_base_dir (str): base directory for catalog, where file will be written - destination_healpix_pixel_map (dict): dictionary that maps the HealpixPixel to a - tuple of origin pixel information: - - - 0 - the total number of rows found in this destination pixel - - 1 - the set of indexes in histogram for the pixels at the original healpix order - """ - partition_info_pointer = paths.get_partition_info_pointer(catalog_base_dir) - data_frame = pd.DataFrame(destination_healpix_pixel_map.keys()) - # Set column names. 
- data_frame.columns = [ - "Norder", - "Npix", - ] - data_frame["num_rows"] = [pixel_info[0] for pixel_info in destination_healpix_pixel_map.values()] - data_frame["Dir"] = [int(x / 10_000) * 10_000 for x in data_frame["Npix"]] - - # Reorder the columns to match full path, and force to integer types. - data_frame = data_frame[["Norder", "Dir", "Npix", "num_rows"]].astype(int) - - file_io.write_dataframe_to_csv(data_frame, partition_info_pointer, index=False) - - -def write_fits_map(catalog_path, histogram: np.ndarray): - """Write the object spatial distribution information to a healpix FITS file. - - Args: - catalog_path (str): base path for the catalog - histogram (:obj:`np.ndarray`): one-dimensional numpy array of long integers where the - value at each index corresponds to the number of objects found at the healpix pixel. - """ - catalog_base_dir = file_io.get_upath(catalog_path) - map_file_pointer = paths.get_point_map_file_pointer(catalog_base_dir) - file_io.write_fits_image(histogram, map_file_pointer) diff --git a/tests/conftest.py b/tests/conftest.py index 134e223b..9f6b618d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,7 +53,6 @@ def catalog_info_data() -> dict: "catalog_name": "test_name", "catalog_type": "object", "total_rows": 10, - "epoch": "J2000", "ra_column": "ra", "dec_column": "dec", } @@ -91,7 +90,6 @@ def source_catalog_info() -> dict: "catalog_name": "test_source", "catalog_type": "source", "total_rows": 100, - "epoch": "J2000", "ra_column": "source_ra", "dec_column": "source_dec", } @@ -103,7 +101,6 @@ def margin_cache_catalog_info_data() -> dict: "catalog_name": "test_margin", "catalog_type": "margin", "total_rows": 100, - "epoch": "J2000", "ra_column": "ra", "dec_column": "dec", "primary_catalog": "test_name", diff --git a/tests/data/almanac/small_sky.yml b/tests/data/almanac/small_sky.yml index 2ed8426f..a6085098 100644 --- a/tests/data/almanac/small_sky.yml +++ b/tests/data/almanac/small_sky.yml @@ -4,7 +4,6 @@ catalog_path: $HATS_DEFAULT_DIR/small_sky catalog_info: catalog_name: small_sky catalog_type: object - epoch: J2000 ra_column: ra dec_column: dec total_rows: 131 diff --git a/tests/data/almanac/small_sky_order1_margin.yml b/tests/data/almanac/small_sky_order1_margin.yml index ab2d3b23..d8b6502e 100644 --- a/tests/data/almanac/small_sky_order1_margin.yml +++ b/tests/data/almanac/small_sky_order1_margin.yml @@ -7,7 +7,6 @@ catalog_info: catalog_name: small_sky_order1_margin catalog_type: margin total_rows: 28 - epoch: J2000 ra_column: ra dec_column: dec primary_catalog: small_sky_order1 diff --git a/tests/data/almanac/small_sky_source.yml b/tests/data/almanac/small_sky_source.yml index 5125f3d0..0f705126 100644 --- a/tests/data/almanac/small_sky_source.yml +++ b/tests/data/almanac/small_sky_source.yml @@ -5,7 +5,6 @@ catalog_info: catalog_name: small_sky_source catalog_type: source total_rows: 17161 - epoch: J2000 ra_column: source_ra dec_column: source_dec primary_catalog: small_sky diff --git a/tests/data/almanac_exception/margin_missing_primary.yml b/tests/data/almanac_exception/margin_missing_primary.yml index 3b84393d..808713b2 100644 --- a/tests/data/almanac_exception/margin_missing_primary.yml +++ b/tests/data/almanac_exception/margin_missing_primary.yml @@ -5,7 +5,6 @@ catalog_info: catalog_name: margin_cache catalog_type: margin total_rows: 100 - epoch: J2000 ra_column: ra dec_column: dec primary_catalog: NOT_A_CATALOG diff --git a/tests/data/almanac_exception/standalone_source_catalog.yml 
b/tests/data/almanac_exception/standalone_source_catalog.yml index 2e3c1fb0..4eb8cb06 100644 --- a/tests/data/almanac_exception/standalone_source_catalog.yml +++ b/tests/data/almanac_exception/standalone_source_catalog.yml @@ -5,7 +5,6 @@ catalog_info: catalog_name: small_sky_source catalog_type: source total_rows: 17161 - epoch: J2000 ra_column: source_ra dec_column: source_dec mjd_column: mjd diff --git a/tests/data/info_only/catalog/properties b/tests/data/info_only/catalog/properties index 1701ec63..37f938d3 100644 --- a/tests/data/info_only/catalog/properties +++ b/tests/data/info_only/catalog/properties @@ -4,5 +4,6 @@ dataproduct_type=object hats_nrows=10 hats_col_j2000_ra=ra hats_col_j2000_dec=dec +hats_cols_default=psf mean_mag_r hats_max_rows=1000 hats_order=0 diff --git a/tests/hats/catalog/dataset/test_table_properties.py b/tests/hats/catalog/dataset/test_table_properties.py index eb3a0e3d..940ac9f3 100644 --- a/tests/hats/catalog/dataset/test_table_properties.py +++ b/tests/hats/catalog/dataset/test_table_properties.py @@ -25,11 +25,11 @@ def test_properties_parsing(): extra_columns="a , b", indexing_column="a", primary_catalog="bar", - unexpected_kwarg="how did this get here", + hats_copyright="LINCC Frameworks 2024", ) assert table_properties.extra_columns == ["a", "b"] - # unexpected_kwarg is not part of the named args, so it shouldn't show up in the debug string + # hats_copyright is not part of the named args, so it shouldn't show up in the debug string assert ( str(table_properties) == """ catalog_name foo @@ -47,7 +47,7 @@ def test_properties_parsing(): extra_columns=["a", "b"], indexing_column="a", primary_catalog="bar", - unexpected_kwarg="how did this get here", + hats_copyright="LINCC Frameworks 2024", ) assert table_properties_using_list == table_properties @@ -73,6 +73,49 @@ def test_properties_allowed_required(): join_column="b", ) + # extra_columnsss is a typo + with pytest.raises(ValueError, match="extra_columnsss"): + TableProperties( + catalog_name="foo", + catalog_type="index", + total_rows=15, + indexing_column="a", + primary_catalog="bar", + extra_columnsss=["beep"], + ) + + +def test_copy_and_update(): + initital_properties = TableProperties( + catalog_name="foo", + catalog_type="index", + total_rows=15, + indexing_column="a", + primary_catalog="bar", + ) + prop_a = initital_properties.copy_and_update() + assert initital_properties == prop_a + + prop_b = initital_properties.copy_and_update(catalog_name="bar") + assert initital_properties != prop_b + assert prop_b.catalog_name == "bar" + + prop_d = initital_properties.copy_and_update(**{"catalog_name": "bar"}) + assert initital_properties != prop_d + assert prop_d.catalog_name == "bar" + assert prop_b == prop_d + + prop_c = initital_properties.copy_and_update(moc_sky_fraction=0.54) + assert initital_properties != prop_c + assert prop_c.__pydantic_extra__["moc_sky_fraction"] == pytest.approx(0.54) + + # extra_columnsss is a typo + with pytest.raises(ValueError, match="extra_columnsss"): + initital_properties.copy_and_update(extra_columnsss=0.54) + + with pytest.raises(ValueError, match="extra_columnsss"): + initital_properties.copy_and_update(**{"extra_columnsss": 0.54}) + def test_read_from_dir_branches( small_sky_dir, diff --git a/tests/hats/catalog/test_partition_info.py b/tests/hats/catalog/test_partition_info.py index dd869ce3..9389a964 100644 --- a/tests/hats/catalog/test_partition_info.py +++ b/tests/hats/catalog/test_partition_info.py @@ -105,6 +105,16 @@ def 
test_get_highest_order(small_sky_order1_dir): assert highest_order == 1 +def test_calculate_fractional_coverage(small_sky_order1_dir): + """test the `calculate_fractional_coverage` method""" + partition_info_file = paths.get_parquet_metadata_pointer(small_sky_order1_dir) + partitions = PartitionInfo.read_from_file(partition_info_file) + + fractional_coverage = partitions.calculate_fractional_coverage() + + assert fractional_coverage == pytest.approx(0.083, abs=0.001) + + def test_write_to_file(tmp_path, small_sky_pixels): """Write out the partition info to file and make sure we can read it again.""" partition_info_pointer = paths.get_parquet_metadata_pointer(tmp_path) diff --git a/tests/hats/io/test_write_metadata.py b/tests/hats/io/test_write_metadata.py deleted file mode 100644 index 8fdb1e07..00000000 --- a/tests/hats/io/test_write_metadata.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Tests of file IO (reads and writes)""" - -import numpy as np -import numpy.testing as npt -import pytest - -import hats.io.write_metadata as io -import hats.pixel_math as hist -import hats.pixel_math.healpix_shim as hp -from hats.io import file_io -from hats.pixel_math.healpix_pixel import HealpixPixel - - -def test_write_partition_info_healpix_pixel_map(assert_text_file_matches, tmp_path): - """Test that we accurately write out the partition stats for overloaded input""" - catalog_base_dir = tmp_path / "test_name" - file_io.make_directory(catalog_base_dir) - expected_lines = [ - "Norder,Dir,Npix,num_rows", - "0,0,11,131", - ] - pixel_map = {HealpixPixel(0, 11): (131, [11])} - io.write_partition_info(catalog_base_dir, destination_healpix_pixel_map=pixel_map) - metadata_filename = catalog_base_dir / "partition_info.csv" - assert_text_file_matches(expected_lines, metadata_filename) - - expected_lines = [ - "Norder,Dir,Npix,num_rows", - "1,0,44,51", - "1,0,45,29", - "1,0,46,51", - ] - pixel_map = { - HealpixPixel(1, 44): (51, [44]), - HealpixPixel(1, 45): (29, [45]), - HealpixPixel(1, 46): (51, [46]), - } - io.write_partition_info(catalog_base_dir, destination_healpix_pixel_map=pixel_map) - metadata_filename = catalog_base_dir / "partition_info.csv" - assert_text_file_matches(expected_lines, metadata_filename) - - -def test_write_partition_info_float(assert_text_file_matches, tmp_path): - """Test that we accurately write out the individual partition stats - even when the input is floats instead of ints""" - catalog_base_dir = tmp_path / "test_name" - file_io.make_directory(catalog_base_dir) - expected_lines = [ - "Norder,Dir,Npix,num_rows", - "0,0,11,131", - ] - pixel_map = {HealpixPixel(0.0, 11.0): (131, [44.0, 45.0, 46.0])} - io.write_partition_info(catalog_base_dir, pixel_map) - metadata_filename = catalog_base_dir / "partition_info.csv" - assert_text_file_matches(expected_lines, metadata_filename) - - -def test_read_write_fits_point_map(tmp_path): - """Check that we write and can read a FITS file for spatial distribution.""" - initial_histogram = hist.empty_histogram(1) - filled_pixels = [51, 29, 51, 0] - initial_histogram[44:] = filled_pixels[:] - io.write_fits_map(tmp_path, initial_histogram) - - output_file = tmp_path / "point_map.fits" - - output = file_io.read_fits_image(output_file) - npt.assert_array_equal(output, initial_histogram) - - # Check the metadata of the fits file: - map_fits_image = hp.read_map(output_file, nest=True, h=True) - - header_dict = dict(map_fits_image[1]) - assert header_dict["ORDERING"] == "NESTED" - assert header_dict["PIXTYPE"] == "HEALPIX" - assert header_dict["NSIDE"] == 2 - - 
npt.assert_array_equal(initial_histogram, map_fits_image[0]) - - -def test_read_ring_fits_point_map(tmp_path): - """Check that we write and can read a FITS file for spatial distribution.""" - output_file = tmp_path / "point_map.fits" - initial_histogram = hist.empty_histogram(1) - filled_pixels = [51, 29, 51, 0] - initial_histogram[44:] = filled_pixels[:] - hp.write_map(output_file, initial_histogram, dtype=np.int64) - - with pytest.warns(UserWarning, match="/hats/issues/271"): - output = file_io.read_fits_image(output_file) - npt.assert_array_equal(output, initial_histogram)
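
Usage sketch (not part of the patch) for the new `copy_and_update` helper and the `EXTRA_ALLOWED_FIELDS` typo check on `TableProperties`; the constructor arguments mirror the added tests.

    from hats.catalog.dataset.table_properties import TableProperties

    # Index-type properties, using the same arguments as the new tests.
    props = TableProperties(
        catalog_name="foo",
        catalog_type="index",
        total_rows=15,
        indexing_column="a",
        primary_catalog="bar",
    )

    # copy_and_update copies the model, applies the keyword updates, and
    # re-validates the result, so the allowed/required checks still run.
    renamed = props.copy_and_update(catalog_name="bar")
    assert renamed.catalog_name == "bar"
    assert props.catalog_name == "foo"  # the original is unchanged

    # Extra keywords must be listed in EXTRA_ALLOWED_FIELDS; anything else
    # (e.g. the "extra_columnsss" typo) raises a ValueError.
    try:
        props.copy_and_update(extra_columnsss=["beep"])
    except ValueError as error:
        print(error)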
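
The ~0.083 expected in `test_calculate_fractional_coverage` is the area of four order-1 tiles over the whole sky (4/48); below is a sketch of the same arithmetic as the new `calculate_fractional_coverage` method, assuming four order-1 partitions as in the small_sky_order1 test catalog.

    import numpy as np

    import hats.pixel_math.healpix_shim as hp

    # Four order-1 partitions, as in the small_sky_order1 test catalog.
    pixel_orders = [1, 1, 1, 1]

    cov_order, cov_count = np.unique(pixel_orders, return_counts=True)
    area_by_order = [hp.nside2pixarea(hp.order2nside(order), degrees=True) for order in cov_order]

    # 41253 is approximately the number of square degrees in a sphere.
    fraction = (area_by_order * cov_count).sum() / 41253
    print(round(fraction, 3))  # ~0.083, i.e. 4/48 of the sky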
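
The new `hats_cols_default` keyword (see the updated `tests/data/info_only/catalog/properties` file) goes through the same space/comma/semicolon-delimited parsing as `extra_columns`; a round-trip sketch using the object-catalog fixture values.

    from hats.catalog.dataset.table_properties import TableProperties

    props = TableProperties(
        catalog_name="test_name",
        catalog_type="object",
        total_rows=10,
        ra_column="ra",
        dec_column="dec",
        # The "before" validator splits on spaces, commas, semicolons, or newlines.
        default_columns="psf mean_mag_r",
    )
    assert props.default_columns == ["psf", "mean_mag_r"]

    # Serializing by alias re-joins the list into the space-delimited form
    # written to the catalog's properties file.
    aliased = props.model_dump(by_alias=True, exclude_none=True)
    assert aliased["hats_cols_default"] == "psf mean_mag_r"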
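
The renamed constants in `hats/io/paths.py` still produce the same hive-style partition paths; a quick sketch, assuming a hypothetical local base directory and that `pixel_catalog_file` takes the base directory and pixel as its first two arguments.

    from upath import UPath

    from hats.io import paths
    from hats.pixel_math.healpix_pixel import HealpixPixel

    # Hypothetical local catalog root; remote URLs work the same way.
    base_dir = UPath("/data/small_sky_order1")

    # Leaf files are laid out as Norder=<order>/Dir=<group>/Npix=<pixel>.parquet,
    # where Dir groups pixel numbers in blocks of 10,000.
    leaf = paths.pixel_catalog_file(base_dir, HealpixPixel(1, 44))
    print(leaf)  # .../Norder=1/Dir=0/Npix=44.parquet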