Skip to content

Commit

Permalink
Add helper functions read_all_files and backup_sdss_id_tables
Browse files Browse the repository at this point in the history
  • Loading branch information
albireox committed Jan 27, 2025
1 parent d44b216 commit a9a0d8c
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 0 deletions.
46 changes: 46 additions & 0 deletions src/too/carton.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from __future__ import annotations

import pathlib
import warnings

from typing import TYPE_CHECKING
Expand Down Expand Up @@ -43,6 +44,51 @@ def run_too_carton():
too_carton.load(mode="append")


def backup_sdss_id_tables(
    database: PeeweeDatabaseConnection,
    tables=(
        "sdss_id_flat",
        "sdss_id_flat_addendum",
        "sdss_id_stacked",
        "sdss_id_stacked_addendum",
    ),
    schema="sandbox",
    outdir: pathlib.Path | str = ".",
    suffix: str = "",
):
    """Backs up the SDSS ID tables.

    Each table is dumped to a CSV file (with a header row) using the
    PostgreSQL ``COPY ... TO STDOUT`` command.

    Parameters
    ----------
    database
        The database connection. Must already be connected.
    tables
        The tables to backup. Each table is backed up as a separate file in
        ``outdir`` with the format ``<schema>_<table_name>_<suffix>.csv``.
    schema
        The schema where the tables are located.
    outdir
        The output directory for the backup files. Defaults to the current
        directory.
    suffix
        A suffix to add to the backup files.

    Raises
    ------
    RuntimeError
        If the database connection has not been established.

    """

    outdir = pathlib.Path(outdir).absolute()
    outdir.mkdir(parents=True, exist_ok=True)

    if suffix:
        suffix = f"_{suffix}"

    # Raise explicitly instead of ``assert``: asserts are stripped when
    # Python runs with ``-O`` and must not guard runtime preconditions.
    if not database.connected:
        raise RuntimeError("Database connection must be established.")

    for table in tables:
        # NOTE(review): schema/table names are interpolated directly into
        # the COPY statement; values are expected to come from trusted
        # callers, not external input.
        cursor = database.cursor()
        try:
            with open(outdir / f"{schema}_{table}{suffix}.csv", "w") as file:
                cursor.copy_expert(
                    f"COPY {schema}.{table} TO STDOUT WITH CSV HEADER",
                    file,
                )
        finally:
            # Close the cursor even if the COPY or file write fails.
            cursor.close()


def update_sdss_id_tables(database: PeeweeDatabaseConnection):
"""Updates the SDSS ID tables."""

Expand Down
52 changes: 52 additions & 0 deletions src/too/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from sdsstools import get_sjd

from too import log
from too.datamodel import too_dtypes
from too.exceptions import ValidationError

Expand Down Expand Up @@ -81,6 +82,57 @@ def read_too_file(
return targets


def read_all_files(
    path: pathlib.Path | str,
    ignore_invalid: bool = False,
    sort: bool = True,
    silent: bool = False,
) -> polars.DataFrame:
    """Reads all ToO files in a directory.

    Parameters
    ----------
    path
        The path to the directory containing the files. All CSV and Parquet files
        will be read.
    ignore_invalid
        If ``True``, ignores files that cannot be read.
    sort
        If ``True``, sorts the resulting dataframe by ``added_on`` and ``too_id``.
    silent
        If ``True``, does not print any output to the console log.

    """

    path = pathlib.Path(path)

    # Sort the matched files so the processing order — and the row order of
    # the returned frame when ``sort=False`` — is deterministic; ``glob``
    # yields files in arbitrary, filesystem-dependent order.
    process_files: list[pathlib.Path] = sorted(
        [*path.glob("*.csv"), *path.glob("*.parquet")]
    )

    # Start from an empty frame with the ToO schema so vstack always works,
    # even when no files match.
    targets = polars.DataFrame({}, schema=too_dtypes)
    for file in process_files:
        try:
            new_targets = read_too_file(file, cast=True)

        except Exception as ee:
            if not silent:
                log.error(f"Failed to read file {file}: {ee}")

            if ignore_invalid:
                continue
            else:
                raise

        else:
            targets = targets.vstack(new_targets)

    if sort:
        targets = targets.sort(["added_on", "too_id"])

    return targets


def deduplicate_too_targets(targets: polars.DataFrame) -> polars.DataFrame:
"""Deduplicates a list of ToO targets preferring the latest ``added_on`` value."""

Expand Down

0 comments on commit a9a0d8c

Please sign in to comment.