diff --git a/src/too/carton.py b/src/too/carton.py
index a276a99..94d3a51 100644
--- a/src/too/carton.py
+++ b/src/too/carton.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
+import pathlib
 import warnings
 from typing import TYPE_CHECKING
 
 
@@ -43,6 +44,51 @@ def run_too_carton():
     too_carton.load(mode="append")
 
 
+def backup_sdss_id_tables(
+    database: PeeweeDatabaseConnection,
+    tables=(
+        "sdss_id_flat",
+        "sdss_id_flat_addendum",
+        "sdss_id_stacked",
+        "sdss_id_stacked_addendum",
+    ),
+    schema="sandbox",
+    outdir: pathlib.Path | str = ".",
+    suffix: str = "",
+):
+    """Backs up the SDSS ID tables.
+
+    Parameters
+    ----------
+    database
+        The database connection.
+    tables
+        The tables to back up. Each table is backed up as a separate file in
+        ``outdir`` with the format ``<schema>_<table><suffix>.csv``.
+    schema
+        The schema where the tables are located.
+    outdir
+        The output directory for the backup files. Defaults to the current
+        directory.
+    suffix
+        A suffix to add to the backup files.
+
+    """
+
+    outdir = pathlib.Path(outdir).absolute()
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    if suffix:
+        suffix = f"_{suffix}"
+
+    if not database.connected: raise RuntimeError("Database connection must be established.")
+
+    for table in tables:
+        with open(outdir / f"{schema}_{table}{suffix}.csv", "w") as file:
+            with database.cursor() as cursor:
+                cursor.copy_expert(f"COPY {schema}.{table} TO STDOUT WITH CSV HEADER", file)
+
+
 def update_sdss_id_tables(database: PeeweeDatabaseConnection):
     """Updates the SDSS ID tables."""
 
diff --git a/src/too/tools.py b/src/too/tools.py
index be775a6..b9e8f08 100644
--- a/src/too/tools.py
+++ b/src/too/tools.py
@@ -23,6 +23,7 @@
 
 from sdsstools import get_sjd
 
+from too import log
 from too.datamodel import too_dtypes
 from too.exceptions import ValidationError
 
@@ -81,6 +82,57 @@ def read_too_file(
     return targets
 
 
+def read_all_files(
+    path: pathlib.Path | str,
+    ignore_invalid: bool = False,
+    sort: bool = True,
+    silent: bool = False,
+) -> polars.DataFrame:
+    """Reads all ToO files in a directory.
+
+    Parameters
+    ----------
+    path
+        The path to the directory containing the files. All CSV and Parquet files
+        will be read.
+    ignore_invalid
+        If ``True``, ignores files that cannot be read.
+    sort
+        If ``True``, sorts the resulting dataframe by ``added_on`` and ``too_id``.
+    silent
+        If ``True``, does not print any output to the console log.
+
+    """
+
+    path = pathlib.Path(path)
+
+    process_files: list[pathlib.Path] = []
+    process_files.extend(path.glob("*.csv"))
+    process_files.extend(path.glob("*.parquet"))
+
+    targets = polars.DataFrame({}, schema=too_dtypes)
+    for file in process_files:
+        try:
+            new_targets = read_too_file(file, cast=True)
+
+        except Exception as ee:
+            if not silent:
+                log.error(f"Failed to read file {file}: {ee}")
+
+            if ignore_invalid:
+                continue
+            else:
+                raise
+
+        else:
+            targets = targets.vstack(new_targets)
+
+    if sort:
+        targets = targets.sort(["added_on", "too_id"])
+
+    return targets
+
+
 def deduplicate_too_targets(targets: polars.DataFrame) -> polars.DataFrame:
     """Deduplicates a list of ToO targets preferring the latest ``added_on`` value."""
 