Skip to content

Commit

Permalink
fix: respect provided column ordering in use_columns when loading a sheet eagerly (#262)
Browse files Browse the repository at this point in the history

Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Jul 22, 2024
1 parent b5dd16b commit e904f91
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 94 deletions.
62 changes: 62 additions & 0 deletions python/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

from datetime import datetime
from typing import Any

import pytest


@pytest.fixture
def expected_data_sheet_null_strings() -> dict[str, list[Any]]:
    """Expected contents of the "sheet-null-strings" fixture files, keyed by column label.

    Each value is the full 10-row column, in sheet order; ``None`` marks cells that are
    expected to load as nulls. NOTE(review): integer-looking columns are expressed as
    floats — presumably because columns containing nulls are inferred as float by the
    reader; confirm against the loader's dtype-inference rules.
    """
    return {
        "FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        "SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
        # Date-typed column (no time component) with leading and trailing nulls.
        "DATES_AND_NULLS": [
            None,
            None,
            None,
            datetime(2022, 12, 19, 0, 0),
            datetime(2022, 8, 26, 0, 0),
            datetime(2023, 5, 6, 0, 0),
            datetime(2023, 3, 20, 0, 0),
            datetime(2022, 8, 29, 0, 0),
            None,
            None,
        ],
        # Datetime-typed column with sub-second precision and interior nulls.
        "TIMESTAMPS_AND_NULLS": [
            None,
            None,
            datetime(2023, 2, 18, 6, 13, 56, 730000),
            datetime(2022, 9, 20, 20, 0, 7, 50000),
            datetime(2022, 9, 24, 17, 4, 31, 236000),
            None,
            None,
            None,
            datetime(2022, 9, 14, 1, 50, 58, 390000),
            datetime(2022, 10, 21, 17, 20, 12, 223000),
        ],
        "INTS_AND_NULLS": [
            2076.0,
            2285.0,
            39323.0,
            None,
            None,
            None,
            11953.0,
            None,
            30192.0,
            None,
        ],
        "FLOATS_AND_NULLS": [
            141.02023312814603,
            778.0655928608671,
            None,
            497.60307287584106,
            627.446112513911,
            None,
            None,
            None,
            488.3509486743364,
            None,
        ],
    }
57 changes: 57 additions & 0 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,13 +481,70 @@ def test_use_columns_with_bad_callable() -> None:

def test_use_columns_with_eager_loading() -> None:
    """Eagerly loaded sheets must honour the column selection/order given in ``use_columns``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    months = [1.0, 2.0]
    years = [2019.0, 2020.0]

    # No selection: columns come back in sheet order.
    batch = reader.load_sheet_eager(0)
    assert batch.schema.names == ["Month", "Year"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # An explicit selection in reversed order is respected.
    batch = reader.load_sheet_eager(0, use_columns=["Year", "Month"])
    assert batch.schema.names == ["Year", "Month"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # A strict subset keeps only the requested column.
    batch = reader.load_sheet_eager(0, use_columns=["Year"])
    assert batch.schema.names == ["Year"]
    assert batch["Year"].tolist() == years
    assert "Month" not in (field.name for field in batch.schema)


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_use_columns_dtypes_eager_loading(
    excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]
) -> None:
    """Eager and lazy loading must agree on dtypes and column order for every selection."""
    expected_pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        pl.col("DATES_AND_NULLS").dt.cast_time_unit("ms"),
        pl.col("TIMESTAMPS_AND_NULLS").dt.cast_time_unit("ms"),
    )
    expected_pd_df = pd.DataFrame(expected_data_sheet_null_strings)
    expected_pd_df["DATES_AND_NULLS"] = expected_pd_df["DATES_AND_NULLS"].dt.as_unit("ms")
    expected_pd_df["TIMESTAMPS_AND_NULLS"] = expected_pd_df["TIMESTAMPS_AND_NULLS"].dt.as_unit("ms")

    # Exercise full, odd-index, even-index selections — in sheet order and reversed.
    ordered = list(expected_data_sheet_null_strings)
    backwards = ordered[::-1]
    selections = (
        ordered,
        ordered[1::2],
        ordered[::2],
        backwards,
        backwards[1::2],
        backwards[::2],
    )

    for use_columns in selections:
        reader = fastexcel.read_excel(path_for_fixture(excel_file))
        eager_sheet = reader.load_sheet_eager(0, use_columns=use_columns)
        pd_df = eager_sheet.to_pandas()
        pl_df = pl.from_arrow(data=eager_sheet)
        assert isinstance(pl_df, pl.DataFrame)
        lazy_sheet = reader.load_sheet(0, use_columns=use_columns)
        pl_df_lazy = lazy_sheet.to_polars()
        pd_df_lazy = lazy_sheet.to_pandas()

        # Lazy and eager paths must produce identical frames…
        pl_assert_frame_equal(pl_df_lazy, pl_df)
        pd_assert_frame_equal(pd_df_lazy, pd_df)

        # …that match the expected data restricted to the selection…
        pl_assert_frame_equal(expected_pl_df.select(use_columns), pl_df)
        pd_assert_frame_equal(expected_pd_df[use_columns], pd_df)

        # …in exactly the requested column order.
        assert pd_df.columns.to_list() == use_columns
        assert pl_df.columns == use_columns
62 changes: 6 additions & 56 deletions python/tests/test_fastexcel.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from __future__ import annotations

from datetime import datetime
from typing import Any

import fastexcel
import pandas as pd
Expand Down Expand Up @@ -492,72 +495,19 @@ def test_sheet_with_ref():


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_null_strings(excel_file: str):
def test_null_strings(excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]):
excel_reader = fastexcel.read_excel(path_for_fixture(excel_file))
sheet = excel_reader.load_sheet(0)

assert sheet.height == sheet.total_height == 10
assert sheet.width == 6

expected = {
"FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
"SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
"DATES_AND_NULLS": [
None,
None,
None,
datetime(2022, 12, 19, 0, 0),
datetime(2022, 8, 26, 0, 0),
datetime(2023, 5, 6, 0, 0),
datetime(2023, 3, 20, 0, 0),
datetime(2022, 8, 29, 0, 0),
None,
None,
],
"TIMESTAMPS_AND_NULLS": [
None,
None,
datetime(2023, 2, 18, 6, 13, 56, 730000),
datetime(2022, 9, 20, 20, 0, 7, 50000),
datetime(2022, 9, 24, 17, 4, 31, 236000),
None,
None,
None,
datetime(2022, 9, 14, 1, 50, 58, 390000),
datetime(2022, 10, 21, 17, 20, 12, 223000),
],
"INTS_AND_NULLS": [
2076.0,
2285.0,
39323.0,
None,
None,
None,
11953.0,
None,
30192.0,
None,
],
"FLOATS_AND_NULLS": [
141.02023312814603,
778.0655928608671,
None,
497.60307287584106,
627.446112513911,
None,
None,
None,
488.3509486743364,
None,
],
}

pd_df = pd.DataFrame(expected)
pd_df = pd.DataFrame(expected_data_sheet_null_strings)
pd_df["DATES_AND_NULLS"] = pd_df["DATES_AND_NULLS"].dt.as_unit("ms")
pd_df["TIMESTAMPS_AND_NULLS"] = pd_df["TIMESTAMPS_AND_NULLS"].dt.as_unit("ms")
pd_assert_frame_equal(sheet.to_pandas(), pd_df)

pl_df = pl.DataFrame(expected).with_columns(
pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
pl.col("DATES_AND_NULLS").dt.cast_time_unit("ms"),
pl.col("TIMESTAMPS_AND_NULLS").dt.cast_time_unit("ms"),
)
Expand Down
17 changes: 3 additions & 14 deletions src/types/python/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@ use std::{
io::{BufReader, Cursor},
};

use arrow::{
datatypes::{Field, Schema},
pyarrow::ToPyArrow,
record_batch::RecordBatch,
};
use arrow::{pyarrow::ToPyArrow, record_batch::RecordBatch};
use calamine::{
open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, Range, Reader, Sheets,
};
Expand All @@ -25,7 +21,7 @@ use crate::{

use crate::utils::schema::get_schema_sample_rows;

use super::excelsheet::record_batch_from_data_and_schema;
use super::excelsheet::record_batch_from_data_and_columns;
use super::excelsheet::{
column_info::{build_available_columns, build_available_columns_info},
sheet_data::ExcelSheetData,
Expand Down Expand Up @@ -138,14 +134,7 @@ impl ExcelReader {

let final_columns = selected_columns.select_columns(&available_columns)?;

let fields = final_columns
.iter()
.map(Into::<Field>::into)
.collect::<Vec<_>>();

let schema = Schema::new(fields);

record_batch_from_data_and_schema(schema, data, offset, limit)
record_batch_from_data_and_columns(final_columns, data, offset, limit)
}

#[allow(clippy::too_many_arguments)]
Expand Down
2 changes: 1 addition & 1 deletion src/types/python/excelsheet/column_info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ impl FromStr for DTypeFrom {
pub(crate) struct ColumnInfo {
/// `str`. The name of the column
#[pyo3(get)]
name: String,
pub name: String,
/// `int`. The index of the column
#[pyo3(get)]
index: usize,
Expand Down
45 changes: 22 additions & 23 deletions src/types/python/excelsheet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::{cmp, collections::HashSet, fmt::Debug, str::FromStr, sync::Arc};

use arrow::{
array::NullArray,
datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit},
datatypes::{Field, Schema},
pyarrow::ToPyArrow,
record_batch::RecordBatch,
};
Expand Down Expand Up @@ -433,33 +433,32 @@ impl From<&ExcelSheet> for Schema {
}
}

pub(crate) fn record_batch_from_data_and_schema(
schema: Schema,
pub(crate) fn record_batch_from_data_and_columns(
columns: Vec<ColumnInfo>,
data: &ExcelSheetData,
offset: usize,
limit: usize,
) -> FastExcelResult<RecordBatch> {
let mut iter = schema
.fields()
.iter()
.enumerate()
.map(|(col_idx, field)| {
let fields = columns.iter().map(Into::<Field>::into).collect::<Vec<_>>();

let schema = Schema::new(fields);

let mut iter = columns
.into_iter()
.map(|column_info| {
let col_idx = column_info.index();
let dtype = *column_info.dtype();
(
field.name(),
match field.data_type() {
ArrowDataType::Boolean => create_boolean_array(data, col_idx, offset, limit),
ArrowDataType::Int64 => create_int_array(data, col_idx, offset, limit),
ArrowDataType::Float64 => create_float_array(data, col_idx, offset, limit),
ArrowDataType::Utf8 => create_string_array(data, col_idx, offset, limit),
ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => {
create_datetime_array(data, col_idx, offset, limit)
}
ArrowDataType::Date32 => create_date_array(data, col_idx, offset, limit),
ArrowDataType::Duration(TimeUnit::Millisecond) => {
create_duration_array(data, col_idx, offset, limit)
}
ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)),
_ => unreachable!(),
column_info.name,
match dtype {
DType::Null => Arc::new(NullArray::new(limit - offset)),
DType::Int => create_int_array(data, col_idx, offset, limit),
DType::Float => create_float_array(data, col_idx, offset, limit),
DType::String => create_string_array(data, col_idx, offset, limit),
DType::Bool => create_boolean_array(data, col_idx, offset, limit),
DType::DateTime => create_datetime_array(data, col_idx, offset, limit),
DType::Date => create_date_array(data, col_idx, offset, limit),
DType::Duration => create_duration_array(data, col_idx, offset, limit),
},
)
})
Expand Down

0 comments on commit e904f91

Please sign in to comment.