Skip to content

Commit

Permalink
Add helper functions read_all_files and backup_sdss_id_tables
Browse files Browse the repository at this point in the history
  • Loading branch information
albireox committed Jan 27, 2025
1 parent d44b216 commit a9a0d8c
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 0 deletions.
46 changes: 46 additions & 0 deletions src/too/carton.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from __future__ import annotations

import pathlib
import warnings

from typing import TYPE_CHECKING
Expand Down Expand Up @@ -43,6 +44,51 @@ def run_too_carton():
too_carton.load(mode="append")


def backup_sdss_id_tables(
    database: PeeweeDatabaseConnection,
    tables=(
        "sdss_id_flat",
        "sdss_id_flat_addendum",
        "sdss_id_stacked",
        "sdss_id_stacked_addendum",
    ),
    schema="sandbox",
    outdir: pathlib.Path | str = ".",
    suffix: str = "",
):
    """Backs up the SDSS ID tables.

    Each table is dumped to a CSV file (with a header row) using the
    PostgreSQL ``COPY ... TO STDOUT`` command.

    Parameters
    ----------
    database
        The database connection. Must already be connected.
    tables
        The tables to backup. Each table is backed up as a separate file in
        ``outdir`` with the format ``<schema>_<table_name>_<suffix>.csv``.
    schema
        The schema where the tables are located.
    outdir
        The output directory for the backup files. Defaults to the current
        directory.
    suffix
        A suffix to add to the backup files.

    Raises
    ------
    RuntimeError
        If the database connection has not been established.

    """

    outdir = pathlib.Path(outdir).absolute()
    outdir.mkdir(parents=True, exist_ok=True)

    if suffix:
        suffix = f"_{suffix}"

    # Raise explicitly instead of ``assert``: asserts are stripped when
    # Python runs with ``-O`` and must not guard runtime preconditions.
    if not database.connected:
        raise RuntimeError("Database connection must be established.")

    for table in tables:
        # NOTE(review): schema/table names are interpolated directly into
        # the COPY statement; values are expected to come from trusted
        # callers, not external input.
        cursor = database.cursor()
        try:
            with open(outdir / f"{schema}_{table}{suffix}.csv", "w") as file:
                cursor.copy_expert(
                    f"COPY {schema}.{table} TO STDOUT WITH CSV HEADER",
                    file,
                )
        finally:
            # Close the cursor even if the COPY or file write fails.
            cursor.close()


def update_sdss_id_tables(database: PeeweeDatabaseConnection):
"""Updates the SDSS ID tables."""

Expand Down
52 changes: 52 additions & 0 deletions src/too/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from sdsstools import get_sjd

from too import log
from too.datamodel import too_dtypes
from too.exceptions import ValidationError

Expand Down Expand Up @@ -81,6 +82,57 @@ def read_too_file(
return targets


def read_all_files(
    path: pathlib.Path | str,
    ignore_invalid: bool = False,
    sort: bool = True,
    silent: bool = False,
) -> polars.DataFrame:
    """Reads all ToO files in a directory.

    Parameters
    ----------
    path
        The path to the directory containing the files. All CSV and Parquet files
        will be read.
    ignore_invalid
        If ``True``, ignores files that cannot be read.
    sort
        If ``True``, sorts the resulting dataframe by ``added_on`` and ``too_id``.
    silent
        If ``True``, does not print any output to the console log.

    """

    path = pathlib.Path(path)

    # Sort the matched files so the processing order — and the row order of
    # the returned frame when ``sort=False`` — is deterministic; ``glob``
    # yields files in arbitrary, filesystem-dependent order.
    process_files: list[pathlib.Path] = sorted(
        [*path.glob("*.csv"), *path.glob("*.parquet")]
    )

    # Start from an empty frame with the ToO schema so vstack always works,
    # even when no files match.
    targets = polars.DataFrame({}, schema=too_dtypes)
    for file in process_files:
        try:
            new_targets = read_too_file(file, cast=True)

        except Exception as ee:
            if not silent:
                log.error(f"Failed to read file {file}: {ee}")

            if ignore_invalid:
                continue
            else:
                raise

        else:
            targets = targets.vstack(new_targets)

    if sort:
        targets = targets.sort(["added_on", "too_id"])

    return targets


def deduplicate_too_targets(targets: polars.DataFrame) -> polars.DataFrame:
"""Deduplicates a list of ToO targets preferring the latest ``added_on`` value."""

Expand Down

0 comments on commit a9a0d8c

Please sign in to comment.