Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add default column to properties #359

Merged
merged 6 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def time_test_cone_filter_multiple_order():
"catalog_name": "test_name",
"catalog_type": "object",
"total_rows": 10,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
}
Expand Down
1 change: 1 addition & 0 deletions src/hats/catalog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Catalog data wrappers"""

from .association_catalog import AssociationCatalog
from .association_catalog.partition_join_info import PartitionJoinInfo
from .catalog import Catalog
from .catalog_type import CatalogType
from .dataset.dataset import Dataset
Expand Down
59 changes: 54 additions & 5 deletions src/hats/catalog/dataset/table_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

## catalog_name, catalog_type, and total_rows are allowed for ALL types
CATALOG_TYPE_ALLOWED_FIELDS = {
CatalogType.OBJECT: ["ra_column", "dec_column"],
CatalogType.SOURCE: ["primary_catalog", "ra_column", "dec_column"],
CatalogType.OBJECT: ["ra_column", "dec_column", "default_columns"],
CatalogType.SOURCE: ["primary_catalog", "ra_column", "dec_column", "default_columns"],
CatalogType.ASSOCIATION: [
"primary_catalog",
"primary_column",
Expand All @@ -24,7 +24,7 @@
"contains_leaf_files",
],
CatalogType.INDEX: ["primary_catalog", "indexing_column", "extra_columns"],
CatalogType.MARGIN: ["primary_catalog", "margin_threshold", "ra_column", "dec_column"],
CatalogType.MARGIN: ["primary_catalog", "margin_threshold", "ra_column", "dec_column", "default_columns"],
}

## catalog_name, catalog_type, and total_rows are required for ALL types
Expand All @@ -42,6 +42,41 @@
CatalogType.MARGIN: ["primary_catalog", "margin_threshold"],
}

# All additional properties in the HATS recommendation.
EXTRA_ALLOWED_FIELDS = [
"addendum_did",
"bib_reference",
"bib_reference_url",
"creator_did",
"data_ucd",
"hats_builder",
"hats_cols_sort",
"hats_cols_survey_id",
"hats_copyright",
"hats_creation_date",
"hats_creator",
"hats_estsize",
"hats_frame",
"hats_max_rows",
"hats_order",
"hats_progenitor_url",
"hats_release_date",
"hats_service_url",
"hats_status",
"hats_version",
"moc_sky_fraction",
"obs_ack",
"obs_copyright",
"obs_copyright_url",
"obs_description",
"obs_regime",
"obs_title",
"prov_progenitor",
"publisher_id",
"t_max",
"t_min",
]


class TableProperties(BaseModel):
"""Container class for catalog metadata"""
Expand All @@ -52,6 +87,8 @@

ra_column: Optional[str] = Field(default=None, alias="hats_col_j2000_ra")
dec_column: Optional[str] = Field(default=None, alias="hats_col_j2000_dec")
default_columns: Optional[List[str]] = Field(default=None, alias="hats_cols_default")
"""Which columns should be read from parquet files, when user doesn't otherwise specify."""

primary_catalog: Optional[str] = Field(default=None, alias="hats_primary_table_url")
"""Reference to object catalog. Relevant for nested, margin, association, and index."""
Expand Down Expand Up @@ -86,7 +123,7 @@
## Allow any extra keyword args to be stored on the properties object.
model_config = ConfigDict(extra="allow", populate_by_name=True, use_enum_values=True)

@field_validator("extra_columns", mode="before")
@field_validator("default_columns", "extra_columns", mode="before")
@classmethod
def space_delimited_list(cls, str_value: str) -> List[str]:
"""Convert a space-delimited list string into a python list of strings."""
Expand All @@ -95,9 +132,11 @@
return list(filter(None, re.split(";| |,|\n", str_value)))
return str_value

@field_serializer("extra_columns")
@field_serializer("default_columns", "extra_columns")
def serialize_as_space_delimited_list(self, str_list: Iterable[str]) -> str:
"""Convert a python list of strings into a space-delimited string."""
if str_list is None:
return None

Check warning on line 139 in src/hats/catalog/dataset/table_properties.py

View check run for this annotation

Codecov / codecov/patch

src/hats/catalog/dataset/table_properties.py#L139

Added line #L139 was not covered by tests
return " ".join(str_list)

@model_validator(mode="after")
Expand All @@ -122,8 +161,18 @@
raise ValueError(
f"Missing required property for table type {self.catalog_type}: {missing_required}"
)

# Check against all known properties - catches typos.
non_allowed = set(self.__pydantic_extra__.keys()) - set(EXTRA_ALLOWED_FIELDS)
delucchi-cmu marked this conversation as resolved.
Show resolved Hide resolved
if len(non_allowed) > 0:
raise ValueError(f"Unexpected extra property: {non_allowed}")
return self

def copy_and_update(self, **kwargs):
    """Create a copy of this properties object, applying the given keyword
    overrides, and re-validate the result.

    Validation raises (e.g. ``ValueError``) if the updated combination of
    properties is not allowed for the catalog type; otherwise the new,
    updated properties object is returned.
    """
    updated_properties = self.model_copy(update=kwargs)
    # model_copy does not run validators — explicitly re-validate the copy.
    TableProperties.model_validate(updated_properties)
    return updated_properties

def explicit_dict(self):
"""Create a dict, based on fields that have been explicitly set, and are not "extra" keys."""
explicit = self.model_dump(by_alias=False, exclude_none=True)
Expand Down
16 changes: 11 additions & 5 deletions src/hats/catalog/partition_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pyarrow as pa
from upath import UPath

import hats.pixel_math.healpix_shim as hp
from hats.io import file_io, paths
from hats.io.parquet_metadata import (
read_row_group_fragments,
Expand All @@ -24,7 +25,6 @@ class PartitionInfo:
"""Container class for per-partition info."""

METADATA_ORDER_COLUMN_NAME = "Norder"
METADATA_DIR_COLUMN_NAME = "Dir"
METADATA_PIXEL_COLUMN_NAME = "Npix"

def __init__(self, pixel_list: List[HealpixPixel], catalog_base_dir: str = None) -> None:
Expand Down Expand Up @@ -98,10 +98,9 @@ def write_to_metadata_files(self, catalog_path: str | Path | UPath | None = None
batches = [
[
pa.RecordBatch.from_arrays(
[[pixel.order], [pixel.dir], [pixel.pixel]],
[[pixel.order], [pixel.pixel]],
names=[
self.METADATA_ORDER_COLUMN_NAME,
self.METADATA_DIR_COLUMN_NAME,
self.METADATA_PIXEL_COLUMN_NAME,
],
)
Expand Down Expand Up @@ -254,12 +253,10 @@ def as_dataframe(self):
partition_info_dict = {
PartitionInfo.METADATA_ORDER_COLUMN_NAME: [],
PartitionInfo.METADATA_PIXEL_COLUMN_NAME: [],
PartitionInfo.METADATA_DIR_COLUMN_NAME: [],
}
for pixel in self.pixel_list:
partition_info_dict[PartitionInfo.METADATA_ORDER_COLUMN_NAME].append(pixel.order)
partition_info_dict[PartitionInfo.METADATA_PIXEL_COLUMN_NAME].append(pixel.pixel)
partition_info_dict[PartitionInfo.METADATA_DIR_COLUMN_NAME].append(pixel.dir)
return pd.DataFrame.from_dict(partition_info_dict)

@classmethod
Expand All @@ -272,3 +269,12 @@ def from_healpix(cls, healpix_pixels: List[HealpixPixel]) -> PartitionInfo:
A `PartitionInfo` object with the same healpix pixels
"""
return cls(healpix_pixels)

def calculate_fractional_coverage(self):
    """Calculate what fraction of the sky is covered by partition tiles."""
    orders = [pixel.order for pixel in self.pixel_list]
    unique_orders, counts_per_order = np.unique(orders, return_counts=True)
    # Area (square degrees) of a single healpix tile at each order present.
    tile_area_by_order = [
        hp.nside2pixarea(hp.order2nside(order), degrees=True) for order in unique_orders
    ]
    # 41253 is the number of square degrees in a sphere
    # https://en.wikipedia.org/wiki/Square_degree
    return (tile_area_by_order * counts_per_order).sum() / 41253
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
1 change: 0 additions & 1 deletion src/hats/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,3 @@
pixel_catalog_file,
pixel_directory,
)
from .write_metadata import write_partition_info
27 changes: 14 additions & 13 deletions src/hats/io/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
from hats.io.file_io.file_pointer import get_upath
from hats.pixel_math.healpix_pixel import INVALID_PIXEL, HealpixPixel

ORDER_DIRECTORY_PREFIX = "Norder"
DIR_DIRECTORY_PREFIX = "Dir"
PIXEL_DIRECTORY_PREFIX = "Npix"
JOIN_ORDER_DIRECTORY_PREFIX = "join_Norder"
JOIN_DIR_DIRECTORY_PREFIX = "join_Dir"
JOIN_PIXEL_DIRECTORY_PREFIX = "join_Npix"
PARTITION_ORDER = "Norder"
PARTITION_DIR = "Dir"
delucchi-cmu marked this conversation as resolved.
Show resolved Hide resolved
PARTITION_PIXEL = "Npix"

MARGIN_ORDER = "margin_Norder"
MARGIN_DIR = "margin_Dir"
MARGIN_PIXEL = "margin_Npix"

PARTITION_INFO_FILENAME = "partition_info.csv"
PARTITION_JOIN_INFO_FILENAME = "partition_join_info.csv"
Expand Down Expand Up @@ -62,7 +63,7 @@ def pixel_directory(
raise ValueError("One of pixel_number or directory_number is required to create pixel directory")
return create_hive_directory_name(
catalog_base_dir,
[ORDER_DIRECTORY_PREFIX, DIR_DIRECTORY_PREFIX],
[PARTITION_ORDER, PARTITION_DIR],
[norder, ndir],
)

Expand Down Expand Up @@ -127,9 +128,9 @@ def pixel_catalog_files(
base_path
+ fs.sep.join(
[
f"{ORDER_DIRECTORY_PREFIX}={pixel.order}",
f"{DIR_DIRECTORY_PREFIX}={pixel.dir}",
f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet" + url_params,
f"{PARTITION_ORDER}={pixel.order}",
f"{PARTITION_DIR}={pixel.dir}",
f"{PARTITION_PIXEL}={pixel.pixel}.parquet" + url_params,
]
)
for pixel in pixels
Expand Down Expand Up @@ -193,9 +194,9 @@ def pixel_catalog_file(

return (
catalog_base_dir
/ f"{ORDER_DIRECTORY_PREFIX}={pixel.order}"
/ f"{DIR_DIRECTORY_PREFIX}={pixel.dir}"
/ f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet{url_params}"
/ f"{PARTITION_ORDER}={pixel.order}"
/ f"{PARTITION_DIR}={pixel.dir}"
/ f"{PARTITION_PIXEL}={pixel.pixel}.parquet{url_params}"
)


Expand Down
51 changes: 0 additions & 51 deletions src/hats/io/write_metadata.py

This file was deleted.

3 changes: 0 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def catalog_info_data() -> dict:
"catalog_name": "test_name",
"catalog_type": "object",
"total_rows": 10,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
}
Expand Down Expand Up @@ -91,7 +90,6 @@ def source_catalog_info() -> dict:
"catalog_name": "test_source",
"catalog_type": "source",
"total_rows": 100,
"epoch": "J2000",
"ra_column": "source_ra",
"dec_column": "source_dec",
}
Expand All @@ -103,7 +101,6 @@ def margin_cache_catalog_info_data() -> dict:
"catalog_name": "test_margin",
"catalog_type": "margin",
"total_rows": 100,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
"primary_catalog": "test_name",
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac/small_sky.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ catalog_path: $HATS_DEFAULT_DIR/small_sky
catalog_info:
catalog_name: small_sky
catalog_type: object
epoch: J2000
ra_column: ra
dec_column: dec
total_rows: 131
1 change: 0 additions & 1 deletion tests/data/almanac/small_sky_order1_margin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ catalog_info:
catalog_name: small_sky_order1_margin
catalog_type: margin
total_rows: 28
epoch: J2000
ra_column: ra
dec_column: dec
primary_catalog: small_sky_order1
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac/small_sky_source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ catalog_info:
catalog_name: small_sky_source
catalog_type: source
total_rows: 17161
epoch: J2000
ra_column: source_ra
dec_column: source_dec
primary_catalog: small_sky
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac_exception/margin_missing_primary.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ catalog_info:
catalog_name: margin_cache
catalog_type: margin
total_rows: 100
epoch: J2000
ra_column: ra
dec_column: dec
primary_catalog: NOT_A_CATALOG
Expand Down
1 change: 0 additions & 1 deletion tests/data/almanac_exception/standalone_source_catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ catalog_info:
catalog_name: small_sky_source
catalog_type: source
total_rows: 17161
epoch: J2000
ra_column: source_ra
dec_column: source_dec
mjd_column: mjd
Expand Down
1 change: 1 addition & 0 deletions tests/data/info_only/catalog/properties
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ dataproduct_type=object
hats_nrows=10
hats_col_j2000_ra=ra
hats_col_j2000_dec=dec
hats_cols_default=psf mean_mag_r
hats_max_rows=1000
hats_order=0
Loading
Loading