diff --git a/src/hipscat/io/file_io/file_io.py b/src/hipscat/io/file_io/file_io.py
index 8a7dcda1..9f7f9761 100644
--- a/src/hipscat/io/file_io/file_io.py
+++ b/src/hipscat/io/file_io/file_io.py
@@ -37,6 +37,19 @@ def make_directory(
     file_system.makedirs(file_pointer, exist_ok=exist_ok)
 
 
+def unnest_headers_for_pandas(storage_options: Union[Dict[Any, Any], None]) -> Union[Dict[Any, Any], None]:
+    """Handle storage options for pandas read/write methods.
+    This is needed because fsspec http storage options are nested under the "headers" key,
+    see https://github.com/astronomy-commons/hipscat/issues/295
+    """
+    if storage_options is not None and "headers" in storage_options:
+        # Copy the storage options to avoid modifying the original dictionary
+        storage_options_copy = storage_options.copy()
+        headers = storage_options_copy.pop("headers")
+        return {**storage_options_copy, **headers}
+    return storage_options
+
+
 def remove_directory(
     file_pointer: FilePointer, ignore_errors=False, storage_options: Union[Dict[Any, Any], None] = None
 ):
@@ -130,7 +143,9 @@ def load_csv_to_pandas(
     Returns:
         pandas dataframe loaded from CSV
     """
-    return pd.read_csv(file_pointer, storage_options=storage_options, **kwargs)
+
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_csv(file_pointer, storage_options=pd_storage_option, **kwargs)
 
 
 def load_parquet_to_pandas(
@@ -145,7 +160,8 @@ def load_parquet_to_pandas(
     Returns:
         pandas dataframe loaded from parquet
     """
-    return pd.read_parquet(file_pointer, storage_options=storage_options, **kwargs)
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_parquet(file_pointer, storage_options=pd_storage_option, **kwargs)
 
 
 def write_dataframe_to_csv(
@@ -353,4 +369,5 @@ def read_parquet_file_to_pandas(
     Returns:
         Pandas DataFrame with the data from the parquet file
     """
-    return pd.read_parquet(file_pointer, storage_options=storage_options, **kwargs)
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_parquet(file_pointer, storage_options=pd_storage_option, **kwargs)
diff --git a/tests/hipscat/io/file_io/test_file_io.py b/tests/hipscat/io/file_io/test_file_io.py
index b75bf6bc..362ba95d 100644
--- a/tests/hipscat/io/file_io/test_file_io.py
+++ b/tests/hipscat/io/file_io/test_file_io.py
@@ -16,6 +16,7 @@
     write_dataframe_to_csv,
     write_string_to_file,
 )
+from hipscat.io.file_io.file_io import unnest_headers_for_pandas
 from hipscat.io.file_io.file_pointer import does_file_or_directory_exist
 from hipscat.io.paths import pixel_catalog_file
 
@@ -28,6 +29,30 @@ def test_make_directory(tmp_path):
     assert does_file_or_directory_exist(test_dir_path)
 
 
+def test_unnest_headers_for_pandas():
+    storage_options = {
+        "headers": {"Authorization": "Bearer my_token"},
+    }
+    storage_options_str = {"Authorization": "Bearer my_token"}
+    assert storage_options_str == unnest_headers_for_pandas(storage_options)
+
+    storage_options = {
+        "key1": "value1",
+        "headers": {"Authorization": "Bearer my_token", "Content": "X"},
+    }
+    storage_options_str = {"key1": "value1", "Authorization": "Bearer my_token", "Content": "X"}
+    assert storage_options_str == unnest_headers_for_pandas(storage_options)
+
+    storage_options = {
+        "key1": "value1",
+        "key2": None,
+    }
+    storage_options_str = {"key1": "value1", "key2": None}
+    assert storage_options_str == unnest_headers_for_pandas(storage_options)
+
+    assert None is unnest_headers_for_pandas(None)
+
+
 def test_make_existing_directory_raises(tmp_path):
     test_dir_path = tmp_path / "test_path"
     make_directory(test_dir_path)