Skip to content

Commit

Permalink
Fix performance regression in quadtree_point_in_polygon by 5x (#1446)
Browse files Browse the repository at this point in the history
Improves on a performance regression identified by @trxcllnt when using `quadtree_point_in_polygon`

Seems like the core slowdown is that the passed `points` is a `GeoSeries`, and the x/y points are generated by interleaving them in `GeoColumnAccessor.xy` and then slicing x/y back out.

Since these points appear to be held in a `cudf.ListDtype` (i.e. `[x, y]`), I tried avoiding interleaving and slicing the the x and y points individually in the lists using the `cudf.Series.list.get` API but this was failing for x/y points for `.polygons.x`/`.polygons.y` since some cuspatial APIs expect these points to be `float` (not sure if it's a cudf bug in `list.get` or how points as structured for polygons)

Instead, I was able to get a 5x speedup by

* Caching the interleaving step to avoid doing this multiple times
* Slicing a cupy array of points instead of a `cudf.Series` (which uselessly slices an `.index` which is not used)


<details>
  <summary>Benchmarking code</summary>

  ```python

# 1.3G    points.arrow
# 722K    polys.arrow

import cudf
import cupy
import cuspatial
import pyarrow as pa
from time import perf_counter_ns

polys = pa.ipc.open_stream("polys.arrow").read_all()
polys = cudf.DataFrame.from_arrow(polys).rename(columns={"tract": "poly"})
point = polys["poly"].list.leaves.struct.explode()
polys = cuspatial.GeoSeries.from_polygons_xy(
    point.interleave_columns(),
    polys["poly"]._column.elements.offsets,
    polys["poly"]._column.offsets,
    cupy.arange(0, len(polys) + 1)
)

point = pa.ipc.open_stream("points.arrow").read_all()
point = cudf.DataFrame.from_arrow(point)
min_x = point["x"].min()
max_x = point["x"].max()
min_y = point["y"].min()
max_y = point["y"].max()

max_size = 0
max_depth = 16
threshold = 10
while max_size <= threshold:
    max_depth -= 1
    max_size = len(point) / pow(4, max_depth) / 4

scale = max(max_x - min_x, max_y - min_y) / (1 << max_depth)

point_xy = cuspatial.GeoSeries.from_points_xy(point.interleave_columns())
point_map, quadtree = (
    cuspatial.quadtree_on_points(point_xy,
                                 min_x,
                                 max_x,
                                 min_y,
                                 max_y,
                                 scale,
                                 max_depth,
                                 max_size))

t0 = perf_counter_ns()

cuspatial.quadtree_point_in_polygon(
    cuspatial.join_quadtree_and_bounding_boxes(
        quadtree,
        cuspatial.polygon_bounding_boxes(polys[0:1]),
        min_x,
        max_x,
        min_y,
        max_y,
        scale,
        max_depth
    ),
    quadtree,
    point_map,
    point_xy,
    polys[0:1]
)

t1 = perf_counter_ns()

print(f"{(t1 - t0) / (10 ** 6)}ms")

t0 = perf_counter_ns()
poly_offsets = cudf.Series(polys[0:1].polygons.part_offset)._column
ring_offsets = cudf.Series(polys[0:1].polygons.ring_offset)._column
poly_points_x = cudf.Series(polys[0:1].polygons.x)._column
poly_points_y = cudf.Series(polys[0:1].polygons.y)._column

from cuspatial._lib import spatial_join

cudf.DataFrame._from_data(
    *spatial_join.quadtree_point_in_polygon(
        cuspatial.join_quadtree_and_bounding_boxes(
            quadtree,
            cuspatial.polygon_bounding_boxes(polys[0:1]),
            min_x,
            max_x,
            min_y,
            max_y,
            scale,
            max_depth
        ),
        quadtree,
        point_map._column,
        point["x"]._column,
        point["y"]._column,
        poly_offsets,
        ring_offsets,
        poly_points_x,
        poly_points_y,
    )
)

t1 = perf_counter_ns()

print(f"{(t1 - t0) / (10 ** 6)}ms")

# 127.406344ms  # this PR
# 644.963021ms  # branch 24.10
```

</details>

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Paul Taylor (https://github.com/trxcllnt)
  - Mark Harris (https://github.com/harrism)

URL: #1446
  • Loading branch information
mroeschke authored Nov 21, 2024
1 parent 0a79456 commit c314155
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 56 deletions.
77 changes: 29 additions & 48 deletions python/cuspatial/cuspatial/core/geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,37 +221,38 @@ def sizes(self):
)

class GeoColumnAccessor:
def __init__(self, list_series, meta):
def __init__(self, list_series, meta, typ):
self._series = list_series
self._col = self._series._column
self._meta = meta
self._type = Feature_Enum.POINT
self._type = typ
# Resample the existing features so that the offsets returned
# by `_offset` methods reflect previous slicing, and match
# the values returned by .xy.
existing_indices = self._meta.union_offsets[
self._meta.input_types == self._type.value
]
self._existing_features = self._col.take(existing_indices._column)

@property
def x(self):
return self.xy[::2].reset_index(drop=True)
return cudf.Series(self.xy.values[::2])

@property
def y(self):
return self.xy[1::2].reset_index(drop=True)
return cudf.Series(self.xy.values[1::2])

@property
@cached_property
def xy(self):
features = self._get_current_features(self._type)
features = self.column()
if hasattr(features, "leaves"):
return cudf.Series(features.leaves().values)
return cudf.Series._from_column(features.leaves())
else:
return cudf.Series()

def _get_current_features(self, type):
# Resample the existing features so that the offsets returned
# by `_offset` methods reflect previous slicing, and match
# the values returned by .xy.
existing_indices = self._meta.union_offsets[
self._meta.input_types == type.value
]
existing_features = self._col.take(existing_indices._column)
return existing_features
def column(self):
"""Return the ListColumn reordered by union offset."""
return self._existing_features

def point_indices(self):
# Return a cupy.ndarray containing the index values that each
Expand All @@ -265,18 +266,10 @@ def point_indices(self):
self._meta.input_types != -1
]

def column(self):
"""Return the ListColumn reordered by union offset."""
return self._get_current_features(self._type)

class MultiPointGeoColumnAccessor(GeoColumnAccessor):
def __init__(self, list_series, meta):
super().__init__(list_series, meta)
self._type = Feature_Enum.MULTIPOINT

@property
def geometry_offset(self):
return self._get_current_features(self._type).offsets.values
return self.column().offsets.values

def point_indices(self):
# Return a cupy.ndarray containing the index values from the
Expand All @@ -286,19 +279,13 @@ def point_indices(self):
return cp.repeat(self._meta.input_types.index, sizes)

class LineStringGeoColumnAccessor(GeoColumnAccessor):
def __init__(self, list_series, meta):
super().__init__(list_series, meta)
self._type = Feature_Enum.LINESTRING

@property
def geometry_offset(self):
return self._get_current_features(self._type).offsets.values
return self.column().offsets.values

@property
def part_offset(self):
return self._get_current_features(
self._type
).elements.offsets.values
return self.column().elements.offsets.values

def point_indices(self):
# Return a cupy.ndarray containing the index values from the
Expand All @@ -308,25 +295,17 @@ def point_indices(self):
return cp.repeat(self._meta.input_types.index, sizes)

class PolygonGeoColumnAccessor(GeoColumnAccessor):
def __init__(self, list_series, meta):
super().__init__(list_series, meta)
self._type = Feature_Enum.POLYGON

@property
def geometry_offset(self):
return self._get_current_features(self._type).offsets.values
return self.column().offsets.values

@property
def part_offset(self):
return self._get_current_features(
self._type
).elements.offsets.values
return self.column().elements.offsets.values

@property
def ring_offset(self):
return self._get_current_features(
self._type
).elements.elements.offsets.values
return self.column().elements.elements.offsets.values

def point_indices(self):
# Return a cupy.ndarray containing the index values from the
Expand All @@ -340,27 +319,29 @@ def point_indices(self):
@property
def points(self):
"""Access the `PointsArray` of the underlying `GeoArrowBuffers`."""
return self.GeoColumnAccessor(self._column.points, self._column._meta)
return self.GeoColumnAccessor(
self._column.points, self._column._meta, Feature_Enum.POINT
)

@property
def multipoints(self):
"""Access the `MultiPointArray` of the underlying `GeoArrowBuffers`."""
return self.MultiPointGeoColumnAccessor(
self._column.mpoints, self._column._meta
self._column.mpoints, self._column._meta, Feature_Enum.MULTIPOINT
)

@property
def lines(self):
"""Access the `LineArray` of the underlying `GeoArrowBuffers`."""
return self.LineStringGeoColumnAccessor(
self._column.lines, self._column._meta
self._column.lines, self._column._meta, Feature_Enum.LINESTRING
)

@property
def polygons(self):
"""Access the `PolygonArray` of the underlying `GeoArrowBuffers`."""
return self.PolygonGeoColumnAccessor(
self._column.polygons, self._column._meta
self._column.polygons, self._column._meta, Feature_Enum.POLYGON
)

def __repr__(self):
Expand Down
17 changes: 9 additions & 8 deletions python/cuspatial/cuspatial/core/spatial/join.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,15 @@ def quadtree_point_in_polygon(
raise ValueError(
"`polygons` Geoseries must contains only polygons geometries."
)

points_x = as_column(points.points.x)
points_y = as_column(points.points.y)

poly_offsets = as_column(polygons.polygons.part_offset)
ring_offsets = as_column(polygons.polygons.ring_offset)
poly_points_x = as_column(polygons.polygons.x)
poly_points_y = as_column(polygons.polygons.y)
points_data = points.points
points_x = as_column(points_data.x)
points_y = as_column(points_data.y)

polygon_data = polygons.polygons
poly_offsets = as_column(polygon_data.part_offset)
ring_offsets = as_column(polygon_data.ring_offset)
poly_points_x = as_column(polygon_data.x)
poly_points_y = as_column(polygon_data.y)

return DataFrame._from_data(
*spatial_join.quadtree_point_in_polygon(
Expand Down

0 comments on commit c314155

Please sign in to comment.