Skip to content

Commit

Permalink
Add default column to properties (#359)
Browse files Browse the repository at this point in the history
* Add default column to properties

* Add fractional coverage on partition info

* Add convenience import.

* Clean it up

* Comment about square degrees.

* Add copy_and_update
  • Loading branch information
delucchi-cmu authored Sep 27, 2024
1 parent 8db8e58 commit f177019
Show file tree
Hide file tree
Showing 17 changed files with 137 additions and 178 deletions.
1 change: 0 additions & 1 deletion benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def time_test_cone_filter_multiple_order():
"catalog_name": "test_name",
"catalog_type": "object",
"total_rows": 10,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
}
Expand Down
1 change: 1 addition & 0 deletions src/hats/catalog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Catalog data wrappers"""

from .association_catalog import AssociationCatalog
from .association_catalog.partition_join_info import PartitionJoinInfo
from .catalog import Catalog
from .catalog_type import CatalogType
from .dataset.dataset import Dataset
Expand Down
59 changes: 54 additions & 5 deletions src/hats/catalog/dataset/table_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

## catalog_name, catalog_type, and total_rows are allowed for ALL types
CATALOG_TYPE_ALLOWED_FIELDS = {
CatalogType.OBJECT: ["ra_column", "dec_column"],
CatalogType.SOURCE: ["primary_catalog", "ra_column", "dec_column"],
CatalogType.OBJECT: ["ra_column", "dec_column", "default_columns"],
CatalogType.SOURCE: ["primary_catalog", "ra_column", "dec_column", "default_columns"],
CatalogType.ASSOCIATION: [
"primary_catalog",
"primary_column",
Expand All @@ -24,7 +24,7 @@
"contains_leaf_files",
],
CatalogType.INDEX: ["primary_catalog", "indexing_column", "extra_columns"],
CatalogType.MARGIN: ["primary_catalog", "margin_threshold", "ra_column", "dec_column"],
CatalogType.MARGIN: ["primary_catalog", "margin_threshold", "ra_column", "dec_column", "default_columns"],
}

## catalog_name, catalog_type, and total_rows are required for ALL types
Expand All @@ -42,6 +42,41 @@
CatalogType.MARGIN: ["primary_catalog", "margin_threshold"],
}

# All additional properties in the HATS recommendation. Any extra key set on a
# TableProperties instance must come from this list; unknown keys are rejected
# during model validation, which catches typos in on-disk properties files.
EXTRA_ALLOWED_FIELDS = [
    "addendum_did",
    "bib_reference",
    "bib_reference_url",
    "creator_did",
    "data_ucd",
    "hats_builder",
    "hats_cols_sort",
    "hats_cols_survey_id",
    "hats_copyright",
    "hats_creation_date",
    "hats_creator",
    "hats_estsize",
    "hats_frame",
    "hats_max_rows",
    "hats_order",
    "hats_progenitor_url",
    "hats_release_date",
    "hats_service_url",
    "hats_status",
    "hats_version",
    "moc_sky_fraction",
    "obs_ack",
    "obs_copyright",
    "obs_copyright_url",
    "obs_description",
    "obs_regime",
    "obs_title",
    "prov_progenitor",
    "publisher_id",
    "t_max",
    "t_min",
]


class TableProperties(BaseModel):
"""Container class for catalog metadata"""
Expand All @@ -52,6 +87,8 @@ class TableProperties(BaseModel):

ra_column: Optional[str] = Field(default=None, alias="hats_col_j2000_ra")
dec_column: Optional[str] = Field(default=None, alias="hats_col_j2000_dec")
default_columns: Optional[List[str]] = Field(default=None, alias="hats_cols_default")
"""Which columns should be read from parquet files, when user doesn't otherwise specify."""

primary_catalog: Optional[str] = Field(default=None, alias="hats_primary_table_url")
"""Reference to object catalog. Relevant for nested, margin, association, and index."""
Expand Down Expand Up @@ -86,7 +123,7 @@ class TableProperties(BaseModel):
## Allow any extra keyword args to be stored on the properties object.
model_config = ConfigDict(extra="allow", populate_by_name=True, use_enum_values=True)

@field_validator("extra_columns", mode="before")
@field_validator("default_columns", "extra_columns", mode="before")
@classmethod
def space_delimited_list(cls, str_value: str) -> List[str]:
"""Convert a space-delimited list string into a python list of strings."""
Expand All @@ -95,9 +132,11 @@ def space_delimited_list(cls, str_value: str) -> List[str]:
return list(filter(None, re.split(";| |,|\n", str_value)))
return str_value

@field_serializer("extra_columns")
@field_serializer("default_columns", "extra_columns")
def serialize_as_space_delimited_list(self, str_list: Optional[Iterable[str]]) -> Optional[str]:
    """Convert a python list of strings into a space-delimited string.

    ``None`` (an unset optional field) is passed through unchanged, so it is
    not serialized as the literal string ``"None"``.
    """
    if str_list is None:
        return None
    return " ".join(str_list)

@model_validator(mode="after")
Expand All @@ -122,8 +161,18 @@ def check_allowed_and_required(self) -> Self:
raise ValueError(
f"Missing required property for table type {self.catalog_type}: {missing_required}"
)

# Check against all known properties - catches typos.
non_allowed = set(self.__pydantic_extra__.keys()) - set(EXTRA_ALLOWED_FIELDS)
if len(non_allowed) > 0:
raise ValueError(f"Unexpected extra property: {non_allowed}")
return self

def copy_and_update(self, **kwargs):
    """Create a copy of these properties, with some values updated.

    Keyword arguments are applied on top of the current field values. Note
    that ``model_copy(update=...)`` does not re-run pydantic validation, so
    the copy is explicitly passed through ``model_validate`` to reject
    invalid combinations before it is returned.
    """
    new_properties = self.model_copy(update=kwargs)
    TableProperties.model_validate(new_properties)
    return new_properties

def explicit_dict(self):
"""Create a dict, based on fields that have been explicitly set, and are not "extra" keys."""
explicit = self.model_dump(by_alias=False, exclude_none=True)
Expand Down
16 changes: 11 additions & 5 deletions src/hats/catalog/partition_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pyarrow as pa
from upath import UPath

import hats.pixel_math.healpix_shim as hp
from hats.io import file_io, paths
from hats.io.parquet_metadata import (
read_row_group_fragments,
Expand All @@ -24,7 +25,6 @@ class PartitionInfo:
"""Container class for per-partition info."""

METADATA_ORDER_COLUMN_NAME = "Norder"
METADATA_DIR_COLUMN_NAME = "Dir"
METADATA_PIXEL_COLUMN_NAME = "Npix"

def __init__(self, pixel_list: List[HealpixPixel], catalog_base_dir: str = None) -> None:
Expand Down Expand Up @@ -98,10 +98,9 @@ def write_to_metadata_files(self, catalog_path: str | Path | UPath | None = None
batches = [
[
pa.RecordBatch.from_arrays(
[[pixel.order], [pixel.dir], [pixel.pixel]],
[[pixel.order], [pixel.pixel]],
names=[
self.METADATA_ORDER_COLUMN_NAME,
self.METADATA_DIR_COLUMN_NAME,
self.METADATA_PIXEL_COLUMN_NAME,
],
)
Expand Down Expand Up @@ -254,12 +253,10 @@ def as_dataframe(self):
partition_info_dict = {
PartitionInfo.METADATA_ORDER_COLUMN_NAME: [],
PartitionInfo.METADATA_PIXEL_COLUMN_NAME: [],
PartitionInfo.METADATA_DIR_COLUMN_NAME: [],
}
for pixel in self.pixel_list:
partition_info_dict[PartitionInfo.METADATA_ORDER_COLUMN_NAME].append(pixel.order)
partition_info_dict[PartitionInfo.METADATA_PIXEL_COLUMN_NAME].append(pixel.pixel)
partition_info_dict[PartitionInfo.METADATA_DIR_COLUMN_NAME].append(pixel.dir)
return pd.DataFrame.from_dict(partition_info_dict)

@classmethod
Expand All @@ -272,3 +269,12 @@ def from_healpix(cls, healpix_pixels: List[HealpixPixel]) -> PartitionInfo:
A `PartitionInfo` object with the same healpix pixels
"""
return cls(healpix_pixels)

def calculate_fractional_coverage(self):
    """Calculate what fraction of the sky is covered by partition tiles."""
    orders = np.asarray([pixel.order for pixel in self.pixel_list])
    unique_orders, counts = np.unique(orders, return_counts=True)
    # Area (square degrees) of a single healpix tile at each distinct order.
    tile_areas = np.asarray(
        [hp.nside2pixarea(hp.order2nside(order), degrees=True) for order in unique_orders]
    )
    # 41253 is the number of square degrees in a sphere
    # https://en.wikipedia.org/wiki/Square_degree
    return np.sum(tile_areas * counts) / 41253
1 change: 0 additions & 1 deletion src/hats/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,3 @@
pixel_catalog_file,
pixel_directory,
)
from .write_metadata import write_partition_info
27 changes: 14 additions & 13 deletions src/hats/io/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
from hats.io.file_io.file_pointer import get_upath
from hats.pixel_math.healpix_pixel import INVALID_PIXEL, HealpixPixel

ORDER_DIRECTORY_PREFIX = "Norder"
DIR_DIRECTORY_PREFIX = "Dir"
PIXEL_DIRECTORY_PREFIX = "Npix"
JOIN_ORDER_DIRECTORY_PREFIX = "join_Norder"
JOIN_DIR_DIRECTORY_PREFIX = "join_Dir"
JOIN_PIXEL_DIRECTORY_PREFIX = "join_Npix"
# Hive-style directory/file name prefixes for the main catalog partitions
# (e.g. "Norder=1/Dir=0/Npix=44.parquet").
PARTITION_ORDER = "Norder"
PARTITION_DIR = "Dir"
PARTITION_PIXEL = "Npix"

# Column name prefixes for margin pixel metadata.
MARGIN_ORDER = "margin_Norder"
MARGIN_DIR = "margin_Dir"
MARGIN_PIXEL = "margin_Npix"

PARTITION_INFO_FILENAME = "partition_info.csv"
PARTITION_JOIN_INFO_FILENAME = "partition_join_info.csv"
Expand Down Expand Up @@ -62,7 +63,7 @@ def pixel_directory(
raise ValueError("One of pixel_number or directory_number is required to create pixel directory")
return create_hive_directory_name(
catalog_base_dir,
[ORDER_DIRECTORY_PREFIX, DIR_DIRECTORY_PREFIX],
[PARTITION_ORDER, PARTITION_DIR],
[norder, ndir],
)

Expand Down Expand Up @@ -127,9 +128,9 @@ def pixel_catalog_files(
base_path
+ fs.sep.join(
[
f"{ORDER_DIRECTORY_PREFIX}={pixel.order}",
f"{DIR_DIRECTORY_PREFIX}={pixel.dir}",
f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet" + url_params,
f"{PARTITION_ORDER}={pixel.order}",
f"{PARTITION_DIR}={pixel.dir}",
f"{PARTITION_PIXEL}={pixel.pixel}.parquet" + url_params,
]
)
for pixel in pixels
Expand Down Expand Up @@ -193,9 +194,9 @@ def pixel_catalog_file(

return (
catalog_base_dir
/ f"{ORDER_DIRECTORY_PREFIX}={pixel.order}"
/ f"{DIR_DIRECTORY_PREFIX}={pixel.dir}"
/ f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet{url_params}"
/ f"{PARTITION_ORDER}={pixel.order}"
/ f"{PARTITION_DIR}={pixel.dir}"
/ f"{PARTITION_PIXEL}={pixel.pixel}.parquet{url_params}"
)


Expand Down
51 changes: 0 additions & 51 deletions src/hats/io/write_metadata.py

This file was deleted.

3 changes: 0 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def catalog_info_data() -> dict:
"catalog_name": "test_name",
"catalog_type": "object",
"total_rows": 10,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
}
Expand Down Expand Up @@ -91,7 +90,6 @@ def source_catalog_info() -> dict:
"catalog_name": "test_source",
"catalog_type": "source",
"total_rows": 100,
"epoch": "J2000",
"ra_column": "source_ra",
"dec_column": "source_dec",
}
Expand All @@ -103,7 +101,6 @@ def margin_cache_catalog_info_data() -> dict:
"catalog_name": "test_margin",
"catalog_type": "margin",
"total_rows": 100,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
"primary_catalog": "test_name",
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac/small_sky.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ catalog_path: $HATS_DEFAULT_DIR/small_sky
catalog_info:
catalog_name: small_sky
catalog_type: object
epoch: J2000
ra_column: ra
dec_column: dec
total_rows: 131
1 change: 0 additions & 1 deletion tests/data/almanac/small_sky_order1_margin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ catalog_info:
catalog_name: small_sky_order1_margin
catalog_type: margin
total_rows: 28
epoch: J2000
ra_column: ra
dec_column: dec
primary_catalog: small_sky_order1
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac/small_sky_source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ catalog_info:
catalog_name: small_sky_source
catalog_type: source
total_rows: 17161
epoch: J2000
ra_column: source_ra
dec_column: source_dec
primary_catalog: small_sky
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac_exception/margin_missing_primary.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ catalog_info:
catalog_name: margin_cache
catalog_type: margin
total_rows: 100
epoch: J2000
ra_column: ra
dec_column: dec
primary_catalog: NOT_A_CATALOG
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac_exception/standalone_source_catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ catalog_info:
catalog_name: small_sky_source
catalog_type: source
total_rows: 17161
epoch: J2000
ra_column: source_ra
dec_column: source_dec
mjd_column: mjd
Expand Down
1 change: 1 addition & 0 deletions tests/data/info_only/catalog/properties
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ dataproduct_type=object
hats_nrows=10
hats_col_j2000_ra=ra
hats_col_j2000_dec=dec
hats_cols_default=psf mean_mag_r
hats_max_rows=1000
hats_order=0
Loading

0 comments on commit f177019

Please sign in to comment.