Merge pull request #296 from astronomy-commons/pandas_storage_options

handle_pandas_storage_options
astronomy-commons · Jul 12, 2024 · c081ede · c081ede
2 parents 97643b6 + 22cdc81
commit c081ede
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 3 deletions.
diff --git a/src/hipscat/io/file_io/file_io.py b/src/hipscat/io/file_io/file_io.py
@@ -37,6 +37,19 @@ def make_directory(
     file_system.makedirs(file_pointer, exist_ok=exist_ok)
 
 
+def unnest_headers_for_pandas(storage_options: Union[Dict[Any, Any], None]) -> Union[Dict[Any, Any], None]:
+    """Handle storage options for pandas read/write methods.
+    This is needed because fsspec http storage options are nested under the "headers" key,
+    see https://github.com/astronomy-commons/hipscat/issues/295
+    """
+    if storage_options is not None and "headers" in storage_options:
+        # Copy the storage options to avoid modifying the original dictionary
+        storage_options_copy = storage_options.copy()
+        headers = storage_options_copy.pop("headers")
+        return {**storage_options_copy, **headers}
+    return storage_options
+
+
 def remove_directory(
     file_pointer: FilePointer, ignore_errors=False, storage_options: Union[Dict[Any, Any], None] = None
 ):
@@ -130,7 +143,9 @@ def load_csv_to_pandas(
     Returns:
         pandas dataframe loaded from CSV
     """
-    return pd.read_csv(file_pointer, storage_options=storage_options, **kwargs)
+
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_csv(file_pointer, storage_options=pd_storage_option, **kwargs)
 
 
 def load_parquet_to_pandas(
@@ -145,7 +160,8 @@ def load_parquet_to_pandas(
     Returns:
         pandas dataframe loaded from parquet
     """
-    return pd.read_parquet(file_pointer, storage_options=storage_options, **kwargs)
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_parquet(file_pointer, storage_options=pd_storage_option, **kwargs)
 
 
 def write_dataframe_to_csv(
@@ -353,4 +369,5 @@ def read_parquet_file_to_pandas(
     Returns:
         Pandas DataFrame with the data from the parquet file
     """
-    return pd.read_parquet(file_pointer, storage_options=storage_options, **kwargs)
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_parquet(file_pointer, storage_options=pd_storage_option, **kwargs)
diff --git a/tests/hipscat/io/file_io/test_file_io.py b/tests/hipscat/io/file_io/test_file_io.py
@@ -16,6 +16,7 @@
     write_dataframe_to_csv,
     write_string_to_file,
 )
+from hipscat.io.file_io.file_io import unnest_headers_for_pandas
 from hipscat.io.file_io.file_pointer import does_file_or_directory_exist
 from hipscat.io.paths import pixel_catalog_file
 
@@ -28,6 +29,30 @@ def test_make_directory(tmp_path):
     assert does_file_or_directory_exist(test_dir_path)
 
 
+def test_unnest_headers_for_pandas():
+    storage_options = {
+        "headers": {"Authorization": "Bearer my_token"},
+    }
+    storage_options_str = {"Authorization": "Bearer my_token"}
+    assert storage_options_str == unnest_headers_for_pandas(storage_options)
+
+    storage_options = {
+        "key1": "value1",
+        "headers": {"Authorization": "Bearer my_token", "Content": "X"},
+    }
+    storage_options_str = {"key1": "value1", "Authorization": "Bearer my_token", "Content": "X"}
+    assert storage_options_str == unnest_headers_for_pandas(storage_options)
+
+    storage_options = {
+        "key1": "value1",
+        "key2": None,
+    }
+    storage_options_str = {"key1": "value1", "key2": None}
+    assert storage_options_str == unnest_headers_for_pandas(storage_options)
+
+    assert None is unnest_headers_for_pandas(None)
+
+
 def test_make_existing_directory_raises(tmp_path):
     test_dir_path = tmp_path / "test_path"
     make_directory(test_dir_path)