Skip to content

Commit

Permalink
fix: respect provided column ordering in use_columns when loading a sheet eagerly (#262)
Browse files Browse the repository at this point in the history

Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Jul 22, 2024
1 parent b5dd16b commit e904f91
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 94 deletions.
62 changes: 62 additions & 0 deletions python/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

from datetime import datetime
from typing import Any

import pytest


@pytest.fixture
def expected_data_sheet_null_strings() -> dict[str, list[Any]]:
    """Expected contents of the "sheet-null-strings" fixture files, keyed by column label.

    Each value is the full 10-row column, in sheet order; ``None`` marks cells that are
    expected to load as nulls. NOTE(review): integer-looking columns are expressed as
    floats — presumably because columns containing nulls are inferred as float by the
    reader; confirm against the loader's dtype-inference rules.
    """
    return {
        "FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        "SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
        # Date-typed column (no time component) with leading and trailing nulls.
        "DATES_AND_NULLS": [
            None,
            None,
            None,
            datetime(2022, 12, 19, 0, 0),
            datetime(2022, 8, 26, 0, 0),
            datetime(2023, 5, 6, 0, 0),
            datetime(2023, 3, 20, 0, 0),
            datetime(2022, 8, 29, 0, 0),
            None,
            None,
        ],
        # Datetime-typed column with sub-second precision and interior nulls.
        "TIMESTAMPS_AND_NULLS": [
            None,
            None,
            datetime(2023, 2, 18, 6, 13, 56, 730000),
            datetime(2022, 9, 20, 20, 0, 7, 50000),
            datetime(2022, 9, 24, 17, 4, 31, 236000),
            None,
            None,
            None,
            datetime(2022, 9, 14, 1, 50, 58, 390000),
            datetime(2022, 10, 21, 17, 20, 12, 223000),
        ],
        "INTS_AND_NULLS": [
            2076.0,
            2285.0,
            39323.0,
            None,
            None,
            None,
            11953.0,
            None,
            30192.0,
            None,
        ],
        "FLOATS_AND_NULLS": [
            141.02023312814603,
            778.0655928608671,
            None,
            497.60307287584106,
            627.446112513911,
            None,
            None,
            None,
            488.3509486743364,
            None,
        ],
    }
57 changes: 57 additions & 0 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,13 +481,70 @@ def test_use_columns_with_bad_callable() -> None:

def test_use_columns_with_eager_loading() -> None:
    """Eagerly loaded sheets must honour the column selection/order given in ``use_columns``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    months = [1.0, 2.0]
    years = [2019.0, 2020.0]

    # No selection: columns come back in sheet order.
    batch = reader.load_sheet_eager(0)
    assert batch.schema.names == ["Month", "Year"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # An explicit selection in reversed order is respected.
    batch = reader.load_sheet_eager(0, use_columns=["Year", "Month"])
    assert batch.schema.names == ["Year", "Month"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # A strict subset keeps only the requested column.
    batch = reader.load_sheet_eager(0, use_columns=["Year"])
    assert batch.schema.names == ["Year"]
    assert batch["Year"].tolist() == years
    assert "Month" not in (field.name for field in batch.schema)


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_use_columns_dtypes_eager_loading(
    excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]
) -> None:
    """Eager and lazy loading must agree on dtypes and column order for every selection."""
    expected_pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        pl.col("DATES_AND_NULLS").dt.cast_time_unit("ms"),
        pl.col("TIMESTAMPS_AND_NULLS").dt.cast_time_unit("ms"),
    )
    expected_pd_df = pd.DataFrame(expected_data_sheet_null_strings)
    expected_pd_df["DATES_AND_NULLS"] = expected_pd_df["DATES_AND_NULLS"].dt.as_unit("ms")
    expected_pd_df["TIMESTAMPS_AND_NULLS"] = expected_pd_df["TIMESTAMPS_AND_NULLS"].dt.as_unit("ms")

    # Exercise full, odd-index, even-index selections — in sheet order and reversed.
    ordered = list(expected_data_sheet_null_strings)
    backwards = ordered[::-1]
    selections = (
        ordered,
        ordered[1::2],
        ordered[::2],
        backwards,
        backwards[1::2],
        backwards[::2],
    )

    for use_columns in selections:
        reader = fastexcel.read_excel(path_for_fixture(excel_file))
        eager_sheet = reader.load_sheet_eager(0, use_columns=use_columns)
        pd_df = eager_sheet.to_pandas()
        pl_df = pl.from_arrow(data=eager_sheet)
        assert isinstance(pl_df, pl.DataFrame)
        lazy_sheet = reader.load_sheet(0, use_columns=use_columns)
        pl_df_lazy = lazy_sheet.to_polars()
        pd_df_lazy = lazy_sheet.to_pandas()

        # Lazy and eager paths must produce identical frames…
        pl_assert_frame_equal(pl_df_lazy, pl_df)
        pd_assert_frame_equal(pd_df_lazy, pd_df)

        # …that match the expected data restricted to the selection…
        pl_assert_frame_equal(expected_pl_df.select(use_columns), pl_df)
        pd_assert_frame_equal(expected_pd_df[use_columns], pd_df)

        # …in exactly the requested column order.
        assert pd_df.columns.to_list() == use_columns
        assert pl_df.columns == use_columns
62 changes: 6 additions & 56 deletions python/tests/test_fastexcel.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from __future__ import annotations

from datetime import datetime
from typing import Any

import fastexcel
import pandas as pd
Expand Down Expand Up @@ -492,72 +495,19 @@ def test_sheet_with_ref():


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_null_strings(excel_file: str):
def test_null_strings(excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]):
excel_reader = fastexcel.read_excel(path_for_fixture(excel_file))
sheet = excel_reader.load_sheet(0)

assert sheet.height == sheet.total_height == 10
assert sheet.width == 6

expected = {
"FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
"SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
"DATES_AND_NULLS": [
None,
None,
None,
datetime(2022, 12, 19, 0, 0),
datetime(2022, 8, 26, 0, 0),
datetime(2023, 5, 6, 0, 0),
datetime(2023, 3, 20, 0, 0),
datetime(2022, 8, 29, 0, 0),
None,
None,
],
"TIMESTAMPS_AND_NULLS": [
None,
None,
datetime(2023, 2, 18, 6, 13, 56, 730000),
datetime(2022, 9, 20, 20, 0, 7, 50000),
datetime(2022, 9, 24, 17, 4, 31, 236000),
None,
None,
None,
datetime(2022, 9, 14, 1, 50, 58, 390000),
datetime(2022, 10, 21, 17, 20, 12, 223000),
],
"INTS_AND_NULLS": [
2076.0,
2285.0,
39323.0,
None,
None,
None,
11953.0,
None,
30192.0,
None,
],
"FLOATS_AND_NULLS": [
141.02023312814603,
778.0655928608671,
None,
497.60307287584106,
627.446112513911,
None,
None,
None,
488.3509486743364,
None,
],
}

pd_df = pd.DataFrame(expected)
pd_df = pd.DataFrame(expected_data_sheet_null_strings)
pd_df["DATES_AND_NULLS"] = pd_df["DATES_AND_NULLS"].dt.as_unit("ms")
pd_df["TIMESTAMPS_AND_NULLS"] = pd_df["TIMESTAMPS_AND_NULLS"].dt.as_unit("ms")
pd_assert_frame_equal(sheet.to_pandas(), pd_df)

pl_df = pl.DataFrame(expected).with_columns(
pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
pl.col("DATES_AND_NULLS").dt.cast_time_unit("ms"),
pl.col("TIMESTAMPS_AND_NULLS").dt.cast_time_unit("ms"),
)
Expand Down
17 changes: 3 additions & 14 deletions src/types/python/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@ use std::{
io::{BufReader, Cursor},
};

use arrow::{
datatypes::{Field, Schema},
pyarrow::ToPyArrow,
record_batch::RecordBatch,
};
use arrow::{pyarrow::ToPyArrow, record_batch::RecordBatch};
use calamine::{
open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, Range, Reader, Sheets,
};
Expand All @@ -25,7 +21,7 @@ use crate::{

use crate::utils::schema::get_schema_sample_rows;

use super::excelsheet::record_batch_from_data_and_schema;
use super::excelsheet::record_batch_from_data_and_columns;
use super::excelsheet::{
column_info::{build_available_columns, build_available_columns_info},
sheet_data::ExcelSheetData,
Expand Down Expand Up @@ -138,14 +134,7 @@ impl ExcelReader {

let final_columns = selected_columns.select_columns(&available_columns)?;

let fields = final_columns
.iter()
.map(Into::<Field>::into)
.collect::<Vec<_>>();

let schema = Schema::new(fields);

record_batch_from_data_and_schema(schema, data, offset, limit)
record_batch_from_data_and_columns(final_columns, data, offset, limit)
}

#[allow(clippy::too_many_arguments)]
Expand Down
2 changes: 1 addition & 1 deletion src/types/python/excelsheet/column_info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ impl FromStr for DTypeFrom {
pub(crate) struct ColumnInfo {
/// `str`. The name of the column
#[pyo3(get)]
name: String,
pub name: String,
/// `int`. The index of the column
#[pyo3(get)]
index: usize,
Expand Down
45 changes: 22 additions & 23 deletions src/types/python/excelsheet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::{cmp, collections::HashSet, fmt::Debug, str::FromStr, sync::Arc};

use arrow::{
array::NullArray,
datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit},
datatypes::{Field, Schema},
pyarrow::ToPyArrow,
record_batch::RecordBatch,
};
Expand Down Expand Up @@ -433,33 +433,32 @@ impl From<&ExcelSheet> for Schema {
}
}

pub(crate) fn record_batch_from_data_and_schema(
schema: Schema,
pub(crate) fn record_batch_from_data_and_columns(
columns: Vec<ColumnInfo>,
data: &ExcelSheetData,
offset: usize,
limit: usize,
) -> FastExcelResult<RecordBatch> {
let mut iter = schema
.fields()
.iter()
.enumerate()
.map(|(col_idx, field)| {
let fields = columns.iter().map(Into::<Field>::into).collect::<Vec<_>>();

let schema = Schema::new(fields);

let mut iter = columns
.into_iter()
.map(|column_info| {
let col_idx = column_info.index();
let dtype = *column_info.dtype();
(
field.name(),
match field.data_type() {
ArrowDataType::Boolean => create_boolean_array(data, col_idx, offset, limit),
ArrowDataType::Int64 => create_int_array(data, col_idx, offset, limit),
ArrowDataType::Float64 => create_float_array(data, col_idx, offset, limit),
ArrowDataType::Utf8 => create_string_array(data, col_idx, offset, limit),
ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => {
create_datetime_array(data, col_idx, offset, limit)
}
ArrowDataType::Date32 => create_date_array(data, col_idx, offset, limit),
ArrowDataType::Duration(TimeUnit::Millisecond) => {
create_duration_array(data, col_idx, offset, limit)
}
ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)),
_ => unreachable!(),
column_info.name,
match dtype {
DType::Null => Arc::new(NullArray::new(limit - offset)),
DType::Int => create_int_array(data, col_idx, offset, limit),
DType::Float => create_float_array(data, col_idx, offset, limit),
DType::String => create_string_array(data, col_idx, offset, limit),
DType::Bool => create_boolean_array(data, col_idx, offset, limit),
DType::DateTime => create_datetime_array(data, col_idx, offset, limit),
DType::Date => create_date_array(data, col_idx, offset, limit),
DType::Duration => create_duration_array(data, col_idx, offset, limit),
},
)
})
Expand Down

0 comments on commit e904f91

Please sign in to comment.