Skip to content

Commit

Permalink
Merge pull request #296 from astronomy-commons/pandas_storage_options
Browse files Browse the repository at this point in the history
handle_pandas_storage_options
  • Loading branch information
Schwarzam authored Jul 12, 2024
2 parents 97643b6 + 22cdc81 commit c081ede
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 3 deletions.
23 changes: 20 additions & 3 deletions src/hipscat/io/file_io/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ def make_directory(
file_system.makedirs(file_pointer, exist_ok=exist_ok)


def unnest_headers_for_pandas(storage_options: Union[Dict[Any, Any], None]) -> Union[Dict[Any, Any], None]:
"""Handle storage options for pandas read/write methods.
This is needed because fsspec http storage options are nested under the "headers" key,
see https://github.com/astronomy-commons/hipscat/issues/295
"""
if storage_options is not None and "headers" in storage_options:
# Copy the storage options to avoid modifying the original dictionary
storage_options_copy = storage_options.copy()
headers = storage_options_copy.pop("headers")
return {**storage_options_copy, **headers}
return storage_options


def remove_directory(
file_pointer: FilePointer, ignore_errors=False, storage_options: Union[Dict[Any, Any], None] = None
):
Expand Down Expand Up @@ -130,7 +143,9 @@ def load_csv_to_pandas(
Returns:
pandas dataframe loaded from CSV
"""
return pd.read_csv(file_pointer, storage_options=storage_options, **kwargs)

pd_storage_option = unnest_headers_for_pandas(storage_options)
return pd.read_csv(file_pointer, storage_options=pd_storage_option, **kwargs)


def load_parquet_to_pandas(
Expand All @@ -145,7 +160,8 @@ def load_parquet_to_pandas(
Returns:
pandas dataframe loaded from parquet
"""
return pd.read_parquet(file_pointer, storage_options=storage_options, **kwargs)
pd_storage_option = unnest_headers_for_pandas(storage_options)
return pd.read_parquet(file_pointer, storage_options=pd_storage_option, **kwargs)


def write_dataframe_to_csv(
Expand Down Expand Up @@ -353,4 +369,5 @@ def read_parquet_file_to_pandas(
Returns:
Pandas DataFrame with the data from the parquet file
"""
return pd.read_parquet(file_pointer, storage_options=storage_options, **kwargs)
pd_storage_option = unnest_headers_for_pandas(storage_options)
return pd.read_parquet(file_pointer, storage_options=pd_storage_option, **kwargs)
25 changes: 25 additions & 0 deletions tests/hipscat/io/file_io/test_file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
write_dataframe_to_csv,
write_string_to_file,
)
from hipscat.io.file_io.file_io import unnest_headers_for_pandas
from hipscat.io.file_io.file_pointer import does_file_or_directory_exist
from hipscat.io.paths import pixel_catalog_file

Expand All @@ -28,6 +29,30 @@ def test_make_directory(tmp_path):
assert does_file_or_directory_exist(test_dir_path)


def test_unnest_headers_for_pandas():
storage_options = {
"headers": {"Authorization": "Bearer my_token"},
}
storage_options_str = {"Authorization": "Bearer my_token"}
assert storage_options_str == unnest_headers_for_pandas(storage_options)

storage_options = {
"key1": "value1",
"headers": {"Authorization": "Bearer my_token", "Content": "X"},
}
storage_options_str = {"key1": "value1", "Authorization": "Bearer my_token", "Content": "X"}
assert storage_options_str == unnest_headers_for_pandas(storage_options)

storage_options = {
"key1": "value1",
"key2": None,
}
storage_options_str = {"key1": "value1", "key2": None}
assert storage_options_str == unnest_headers_for_pandas(storage_options)

assert None is unnest_headers_for_pandas(None)


def test_make_existing_directory_raises(tmp_path):
test_dir_path = tmp_path / "test_path"
make_directory(test_dir_path)
Expand Down

0 comments on commit c081ede

Please sign in to comment.