Skip to content

Commit

Permalink
apacheGH-39855: [Python] ListView support for pa.array() (apache#40160)
Browse files Browse the repository at this point in the history
### Rationale for this change

Add pa.array() instantiation support for ListView and LargeListView formats.

### What changes are included in this PR?

* pa.array() now supports creating arrays of the ListView and LargeListView types
* ListArray and LargeListArray now have their size initialized before elements are added during Python-to-Arrow conversion. This makes these types convertible to the ListViewArray and LargeListViewArray types.

### Are these changes tested?

Yes, unit tested.

### Are there any user-facing changes?

Yes, new feature added.
* Closes: apache#39855
* GitHub Issue: apache#39855

Authored-by: Dane Pitkin <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
danepitkin authored Mar 1, 2024
1 parent 2308e40 commit 7c4f4c2
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 43 deletions.
23 changes: 21 additions & 2 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,8 @@ struct PyConverterTrait<
};

template <typename T>
struct PyConverterTrait<T, enable_if_list_like<T>> {
struct PyConverterTrait<
T, enable_if_t<is_list_like_type<T>::value || is_list_view_type<T>::value>> {
using type = PyListConverter<T>;
};

Expand Down Expand Up @@ -803,7 +804,6 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
return this->list_builder_->AppendNull();
}

RETURN_NOT_OK(this->list_builder_->Append());
if (PyArray_Check(value)) {
RETURN_NOT_OK(AppendNdarray(value));
} else if (PySequence_Check(value)) {
Expand All @@ -824,6 +824,21 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
}

protected:
// Overload selected for map types: the map builder's Append() takes no
// arguments, so the element count is accepted only to keep the overload set
// uniform and is otherwise ignored.
Status AppendTo(const MapType*, int64_t /*size*/) {
  return this->list_builder_->Append();
}

// Overload selected for fixed-size list types: the builder's Append() takes
// no arguments, so the element count is accepted only to keep the overload
// set uniform and is otherwise ignored.
Status AppendTo(const FixedSizeListType*, int64_t /*size*/) {
  return this->list_builder_->Append();
}

// Overload selected for the remaining list-like types.
// A ListType slot must be opened with its element count so that it stays
// convertible to a ListViewType; a ListViewType slot always needs the count.
Status AppendTo(const BaseListType*, int64_t size) {
  constexpr bool kIsValid = true;  // the slot being opened is non-null
  return this->list_builder_->Append(kIsValid, size);
}

Status ValidateBuilder(const MapType*) {
if (this->list_builder_->key_builder()->null_count() > 0) {
return Status::Invalid("Invalid Map: key field cannot contain null values");
Expand All @@ -836,11 +851,14 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {

// Appends one Python sequence as a single list slot: opens the slot with the
// sequence length, checks the child builder for overflow, then hands every
// element to the value converter.
// NOTE(review): PySequence_Size() reports failure as -1 and that result is
// not checked here -- confirm callers only pass objects with a usable length.
Status AppendSequence(PyObject* value) {
  const auto length = static_cast<int64_t>(PySequence_Size(value));
  RETURN_NOT_OK(AppendTo(this->list_type_, length));
  RETURN_NOT_OK(this->list_builder_->ValidateOverflow(length));
  return this->value_converter_->Extend(value, length);
}

Status AppendIterable(PyObject* value) {
auto size = static_cast<int64_t>(PyObject_Size(value));
RETURN_NOT_OK(AppendTo(this->list_type_, size));
PyObject* iterator = PyObject_GetIter(value);
OwnedRef iter_ref(iterator);
while (PyObject* item = PyIter_Next(iterator)) {
Expand All @@ -857,6 +875,7 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
return Status::Invalid("Can only convert 1-dimensional array values");
}
const int64_t size = PyArray_SIZE(ndarray);
RETURN_NOT_OK(AppendTo(this->list_type_, size));
RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));

const auto value_type = this->value_converter_->builder()->type();
Expand Down
4 changes: 3 additions & 1 deletion python/pyarrow/tests/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,9 @@ def list_types(item_strategy=primitive_types):
pa.list_,
item_strategy,
st.integers(min_value=0, max_value=16)
)
),
st.builds(pa.list_view, item_strategy),
st.builds(pa.large_list_view, item_strategy)
)


Expand Down
147 changes: 136 additions & 11 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,8 @@ def test_string_binary_from_buffers():
assert copied.null_count == 0


@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
@pytest.mark.parametrize('list_type_factory', [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_list_from_buffers(list_type_factory):
ty = list_type_factory(pa.int16())
array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
Expand All @@ -637,15 +638,15 @@ def test_list_from_buffers(list_type_factory):

with pytest.raises(ValueError):
# No children
pa.Array.from_buffers(ty, 4, [None, buffers[1]])
pa.Array.from_buffers(ty, 4, buffers[:ty.num_buffers])

child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child])
child = pa.Array.from_buffers(pa.int16(), 6, buffers[ty.num_buffers:])
copied = pa.Array.from_buffers(ty, 4, buffers[:ty.num_buffers], children=[child])
assert copied.equals(array)

with pytest.raises(ValueError):
# too many children
pa.Array.from_buffers(ty, 4, [None, buffers[1]],
pa.Array.from_buffers(ty, 4, buffers[:ty.num_buffers],
children=[child, child])


Expand Down Expand Up @@ -2022,6 +2023,9 @@ def test_cast_identities(ty, values):
([[1, 2], [3]], pa.list_(pa.int64())),
([[4, 5], [6]], pa.large_list(pa.int16())),
([['a'], None, ['b', 'c']], pa.list_(pa.string())),
([[1, 2], [3]], pa.list_view(pa.int64())),
([[4, 5], [6]], pa.large_list_view(pa.int16())),
([['a'], None, ['b', 'c']], pa.list_view(pa.string())),
([(1, 'a'), (2, 'c'), None],
pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
]
Expand Down Expand Up @@ -3575,9 +3579,10 @@ def test_run_end_encoded_from_buffers():
1, offset, children)


@pytest.mark.parametrize(('list_array_type'),
[pa.ListViewArray, pa.LargeListViewArray])
def test_list_view_from_arrays(list_array_type):
@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
[(pa.ListViewArray, pa.list_view),
(pa.LargeListViewArray, pa.large_list_view)])
def test_list_view_from_arrays(list_array_type, list_type_factory):
# test in order offsets, similar to ListArray representation
values = [1, 2, 3, 4, 5, 6, None, 7]
offsets = [0, 2, 4, 6]
Expand All @@ -3589,6 +3594,17 @@ def test_list_view_from_arrays(list_array_type):
assert array.offsets.to_pylist() == offsets
assert array.sizes.to_pylist() == sizes

# with specified type
typ = list_type_factory(pa.field("name", pa.int64()))
result = list_array_type.from_arrays(offsets, sizes, values, typ)
assert result.type == typ
assert result.type.value_field.name == "name"

# with mismatching type
typ = list_type_factory(pa.binary())
with pytest.raises(TypeError):
list_array_type.from_arrays(offsets, sizes, values, type=typ)

# test out of order offsets with overlapping values
values = [1, 2, 3, 4]
offsets = [2, 1, 0]
Expand Down Expand Up @@ -3635,12 +3651,121 @@ def test_list_view_from_arrays(list_array_type):
assert array.sizes.to_pylist() == sizes


@pytest.mark.parametrize(('list_array_type'),
[pa.ListViewArray, pa.LargeListViewArray])
def test_list_view_flatten(list_array_type):
@pytest.mark.parametrize(
    ('list_array_type', 'list_type_factory'),
    [
        (pa.ListViewArray, pa.list_view),
        (pa.LargeListViewArray, pa.large_list_view),
    ],
)
def test_list_view_from_arrays_fails(list_array_type, list_type_factory):
    # Inputs shared by both failure scenarios below.
    child_values = [1, 2]
    view_sizes = [1, 1, 0]
    validity_mask = pa.array([False, False, True])

    # Passing a validity mask together with offsets (or sizes) that contain
    # nulls is ambiguous and must be rejected.
    offsets_with_null = [0, 1, None]
    with pytest.raises(pa.lib.ArrowInvalid):
        list_array_type.from_arrays(
            offsets_with_null, view_sizes, child_values, mask=validity_mask)

    # Null-free offsets plus a mask build an array that is then sliced.
    full = list_array_type.from_arrays(
        [0, 1, 1], view_sizes, child_values, mask=validity_mask)
    tail = full[1:]

    # Offsets and sizes taken from a slice must not be combined with a
    # validity mask.
    with pytest.raises(pa.lib.ArrowInvalid):
        list_array_type.from_arrays(
            tail.offsets, tail.sizes, tail.values, mask=tail.is_null())


@pytest.mark.parametrize(
    ('list_array_type', 'list_type_factory', 'offset_type'),
    [
        (pa.ListViewArray, pa.list_view, pa.int32()),
        (pa.LargeListViewArray, pa.large_list_view, pa.int64()),
    ],
)
def test_list_view_flatten(list_array_type, list_type_factory, offset_type):
    # Level 0: the flat int64 values expected after full flattening.
    flat = pa.array([1, None, 2, 3, 4, 5, 6, 7, 8], type=pa.int64())

    # Level 1: a list-view of int64 with null and empty entries.
    single_type = list_type_factory(pa.int64())
    single = pa.array(
        [[1, None, 2], None, [3, 4], [], [5, 6], None, [7, 8]],
        type=single_type)
    single_offsets = pa.array([0, 3, 3, 5, 5, 7, 7], type=offset_type)
    single_sizes = pa.array([3, 0, 2, 0, 2, 0, 2], type=offset_type)

    # Level 2: a list-view of list-views wrapping the level-1 entries.
    nested_type = list_type_factory(list_type_factory(pa.int64()))
    nested = pa.array(
        [
            None,
            [[1, None, 2], None, [3, 4]],
            [],
            [[], [5, 6], None],
            [[7, 8]],
        ],
        type=nested_type)
    nested_offsets = pa.array([0, 0, 3, 3, 6], type=offset_type)
    nested_sizes = pa.array([0, 3, 0, 3, 1], type=offset_type)

    # One level of nesting.
    assert single.flatten().equals(flat)
    assert single.offsets.equals(single_offsets)
    assert single.sizes.equals(single_sizes)
    assert single.values.equals(flat)

    # Two levels of nesting.
    assert nested.flatten().equals(single)
    assert nested.offsets.equals(nested_offsets)
    assert nested.sizes.equals(nested_sizes)
    assert nested.values.equals(single)
    assert nested.flatten().flatten().equals(flat)
    assert nested.values.values.equals(flat)

    # test out of order offsets
    child_values = [1, 2, 3, 4]
    reversed_offsets = [3, 2, 1, 0]
    unit_sizes = [1, 1, 1, 1]
    out_of_order = list_array_type.from_arrays(
        reversed_offsets, unit_sizes, child_values)

    assert out_of_order.flatten().to_pylist() == [4, 3, 2, 1]

    # test null elements backed by non-empty sublists
    last_is_null = pa.array([False, False, False, True])
    masked = list_array_type.from_arrays(
        reversed_offsets, unit_sizes, child_values, mask=last_is_null)

    assert masked.flatten().to_pylist() == [4, 3, 2]
    assert masked.values.to_pylist() == [1, 2, 3, 4]


@pytest.mark.parametrize('list_view_type', [pa.ListViewArray, pa.LargeListViewArray])
def test_list_view_slice(list_view_type):
    # After slicing:
    #   * .values still refers to the full values buffer,
    #   * .offsets is sliced too, so it still points into that full buffer,
    #   * .flatten() returns only the values covered by the slice.
    full = list_view_type.from_arrays(
        offsets=[0, 3, 4],
        sizes=[3, 1, 2],
        values=[1, 2, 3, 4, 5, 6],
    )
    tail = full[1:]

    assert tail.values.to_pylist() == [1, 2, 3, 4, 5, 6]
    assert tail.offsets.to_pylist() == [3, 4]
    assert tail.flatten().to_pylist() == [4, 5, 6]

    start = tail.offsets[0].as_py()
    stop = tail.offsets[1].as_py()

    # The first sliced element must match the window of the full values
    # array delimited by the first two sliced offsets.
    first_element = tail[0].as_py()
    window = tail.values[start:stop].to_pylist()
    assert first_element == window
    assert window == [4]
44 changes: 30 additions & 14 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,21 +252,17 @@ def test_nested_lists(seq):
assert arr.null_count == 1
assert arr.type == pa.list_(pa.int64())
assert arr.to_pylist() == data
# With explicit type
arr = pa.array(seq(data), type=pa.list_(pa.int32()))
assert len(arr) == 3
assert arr.null_count == 1
assert arr.type == pa.list_(pa.int32())
assert arr.to_pylist() == data


@parametrize_with_sequence_types
def test_nested_large_lists(seq):
@pytest.mark.parametrize("factory", [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_nested_lists_with_explicit_type(seq, factory):
data = [[], [1, 2], None]
arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
arr = pa.array(seq(data), type=factory(pa.int16()))
assert len(arr) == 3
assert arr.null_count == 1
assert arr.type == pa.large_list(pa.int16())
assert arr.type == factory(pa.int16())
assert arr.to_pylist() == data


Expand All @@ -277,15 +273,22 @@ def test_list_with_non_list(seq):
pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
with pytest.raises(TypeError):
pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
with pytest.raises(TypeError):
pa.array(seq([[], [1, 2], 3]), type=pa.list_view(pa.int64()))
with pytest.raises(TypeError):
pa.array(seq([[], [1, 2], 3]), type=pa.large_list_view(pa.int64()))


@parametrize_with_sequence_types
def test_nested_arrays(seq):
@pytest.mark.parametrize("factory", [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_nested_arrays(seq, factory):
arr = pa.array(seq([np.array([], dtype=np.int64),
np.array([1, 2], dtype=np.int64), None]))
np.array([1, 2], dtype=np.int64), None]),
type=factory(pa.int64()))
assert len(arr) == 3
assert arr.null_count == 1
assert arr.type == pa.list_(pa.int64())
assert arr.type == factory(pa.int64())
assert arr.to_pylist() == [[], [1, 2], None]


Expand Down Expand Up @@ -1464,9 +1467,18 @@ def test_sequence_duration_nested_lists():
assert arr.type == pa.list_(pa.duration('us'))
assert arr.to_pylist() == data

arr = pa.array(data, type=pa.list_(pa.duration('ms')))

@pytest.mark.parametrize("factory", [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_sequence_duration_nested_lists_with_explicit_type(factory):
td1 = datetime.timedelta(1, 1, 1000)
td2 = datetime.timedelta(1, 100)

data = [[td1, None], [td1, td2]]

arr = pa.array(data, type=factory(pa.duration('ms')))
assert len(arr) == 2
assert arr.type == pa.list_(pa.duration('ms'))
assert arr.type == factory(pa.duration('ms'))
assert arr.to_pylist() == data


Expand Down Expand Up @@ -2430,6 +2442,10 @@ def test_array_from_pylist_offset_overflow():
),
([[1, 2, 3]], [pa.scalar([1, 2, 3])], pa.list_(pa.int64())),
([["a", "b"]], [pa.scalar(["a", "b"])], pa.list_(pa.string())),
([[1, 2, 3]], [pa.scalar([1, 2, 3], type=pa.list_view(pa.int64()))],
pa.list_view(pa.int64())),
([["a", "b"]], [pa.scalar(["a", "b"], type=pa.list_view(pa.string()))],
pa.list_view(pa.string())),
(
[1, 2, None],
[pa.scalar(1, type=pa.int8()), pa.scalar(2, type=pa.int8()), None],
Expand Down
Loading

0 comments on commit 7c4f4c2

Please sign in to comment.