From c3141552e1a00e7086315e8f1ce28ef53ab31d6e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Nov 2024 17:05:24 -0800 Subject: [PATCH] Fix performance regression in quadtree_point_in_polygon by 5x (#1446) Improves on a performance regression identified by @trxcllnt when using `quadtree_point_in_polygon` Seems like the core slowdown is that the passed `points` is a `GeoSeries`, and the x/y points are generated by interleaving them in `GeoColumnAccessor.xy` and then slicing x/y back out. Since these points appear to be held in a `cudf.ListDtype` (i.e. `[x, y]`), I tried avoiding interleaving and slicing the the x and y points individually in the lists using the `cudf.Series.list.get` API but this was failing for x/y points for `.polygons.x`/`.polygons.y` since some cuspatial APIs expect these points to be `float` (not sure if it's a cudf bug in `list.get` or how points as structured for polygons) Instead, I was able to get a 5x speedup by * Caching the interleaving step to avoid doing this multiple times * Slicing a cupy array of points instead of a `cudf.Series` (which uselessly slices an `.index` which is not used)
Benchmarking code ```python # 1.3G points.arrow # 722K polys.arrow import cudf import cupy import cuspatial import pyarrow as pa from time import perf_counter_ns polys = pa.ipc.open_stream("polys.arrow").read_all() polys = cudf.DataFrame.from_arrow(polys).rename(columns={"tract": "poly"}) point = polys["poly"].list.leaves.struct.explode() polys = cuspatial.GeoSeries.from_polygons_xy( point.interleave_columns(), polys["poly"]._column.elements.offsets, polys["poly"]._column.offsets, cupy.arange(0, len(polys) + 1) ) point = pa.ipc.open_stream("points.arrow").read_all() point = cudf.DataFrame.from_arrow(point) min_x = point["x"].min() max_x = point["x"].max() min_y = point["y"].min() max_y = point["y"].max() max_size = 0 max_depth = 16 threshold = 10 while max_size <= threshold: max_depth -= 1 max_size = len(point) / pow(4, max_depth) / 4 scale = max(max_x - min_x, max_y - min_y) / (1 << max_depth) point_xy = cuspatial.GeoSeries.from_points_xy(point.interleave_columns()) point_map, quadtree = ( cuspatial.quadtree_on_points(point_xy, min_x, max_x, min_y, max_y, scale, max_depth, max_size)) t0 = perf_counter_ns() cuspatial.quadtree_point_in_polygon( cuspatial.join_quadtree_and_bounding_boxes( quadtree, cuspatial.polygon_bounding_boxes(polys[0:1]), min_x, max_x, min_y, max_y, scale, max_depth ), quadtree, point_map, point_xy, polys[0:1] ) t1 = perf_counter_ns() print(f"{(t1 - t0) / (10 ** 6)}ms") t0 = perf_counter_ns() poly_offsets = cudf.Series(polys[0:1].polygons.part_offset)._column ring_offsets = cudf.Series(polys[0:1].polygons.ring_offset)._column poly_points_x = cudf.Series(polys[0:1].polygons.x)._column poly_points_y = cudf.Series(polys[0:1].polygons.y)._column from cuspatial._lib import spatial_join cudf.DataFrame._from_data( *spatial_join.quadtree_point_in_polygon( cuspatial.join_quadtree_and_bounding_boxes( quadtree, cuspatial.polygon_bounding_boxes(polys[0:1]), min_x, max_x, min_y, max_y, scale, max_depth ), quadtree, point_map._column, point["x"]._column, point["y"]._column, poly_offsets, ring_offsets, poly_points_x, poly_points_y, ) ) t1 = perf_counter_ns() print(f"{(t1 - t0) / (10 ** 6)}ms") # 127.406344ms # this PR # 644.963021ms # branch 24.10 ```
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Paul Taylor (https://github.com/trxcllnt) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cuspatial/pull/1446 --- python/cuspatial/cuspatial/core/geoseries.py | 77 +++++++------------ .../cuspatial/cuspatial/core/spatial/join.py | 17 ++-- 2 files changed, 38 insertions(+), 56 deletions(-) diff --git a/python/cuspatial/cuspatial/core/geoseries.py b/python/cuspatial/cuspatial/core/geoseries.py index b8d7c4945..37da66744 100644 --- a/python/cuspatial/cuspatial/core/geoseries.py +++ b/python/cuspatial/cuspatial/core/geoseries.py @@ -221,37 +221,38 @@ def sizes(self): ) class GeoColumnAccessor: - def __init__(self, list_series, meta): + def __init__(self, list_series, meta, typ): self._series = list_series self._col = self._series._column self._meta = meta - self._type = Feature_Enum.POINT + self._type = typ + # Resample the existing features so that the offsets returned + # by `_offset` methods reflect previous slicing, and match + # the values returned by .xy. + existing_indices = self._meta.union_offsets[ + self._meta.input_types == self._type.value + ] + self._existing_features = self._col.take(existing_indices._column) @property def x(self): - return self.xy[::2].reset_index(drop=True) + return cudf.Series(self.xy.values[::2]) @property def y(self): - return self.xy[1::2].reset_index(drop=True) + return cudf.Series(self.xy.values[1::2]) - @property + @cached_property def xy(self): - features = self._get_current_features(self._type) + features = self.column() if hasattr(features, "leaves"): - return cudf.Series(features.leaves().values) + return cudf.Series._from_column(features.leaves()) else: return cudf.Series() - def _get_current_features(self, type): - # Resample the existing features so that the offsets returned - # by `_offset` methods reflect previous slicing, and match - # the values returned by .xy. - existing_indices = self._meta.union_offsets[ - self._meta.input_types == type.value - ] - existing_features = self._col.take(existing_indices._column) - return existing_features + def column(self): + """Return the ListColumn reordered by union offset.""" + return self._existing_features def point_indices(self): # Return a cupy.ndarray containing the index values that each @@ -265,18 +266,10 @@ def point_indices(self): self._meta.input_types != -1 ] - def column(self): - """Return the ListColumn reordered by union offset.""" - return self._get_current_features(self._type) - class MultiPointGeoColumnAccessor(GeoColumnAccessor): - def __init__(self, list_series, meta): - super().__init__(list_series, meta) - self._type = Feature_Enum.MULTIPOINT - @property def geometry_offset(self): - return self._get_current_features(self._type).offsets.values + return self.column().offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -286,19 +279,13 @@ def point_indices(self): return cp.repeat(self._meta.input_types.index, sizes) class LineStringGeoColumnAccessor(GeoColumnAccessor): - def __init__(self, list_series, meta): - super().__init__(list_series, meta) - self._type = Feature_Enum.LINESTRING - @property def geometry_offset(self): - return self._get_current_features(self._type).offsets.values + return self.column().offsets.values @property def part_offset(self): - return self._get_current_features( - self._type - ).elements.offsets.values + return self.column().elements.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -308,25 +295,17 @@ def point_indices(self): return cp.repeat(self._meta.input_types.index, sizes) class PolygonGeoColumnAccessor(GeoColumnAccessor): - def __init__(self, list_series, meta): - super().__init__(list_series, meta) - self._type = Feature_Enum.POLYGON - @property def geometry_offset(self): - return self._get_current_features(self._type).offsets.values + return self.column().offsets.values @property def part_offset(self): - return self._get_current_features( - self._type - ).elements.offsets.values + return self.column().elements.offsets.values @property def ring_offset(self): - return self._get_current_features( - self._type - ).elements.elements.offsets.values + return self.column().elements.elements.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -340,27 +319,29 @@ def point_indices(self): @property def points(self): """Access the `PointsArray` of the underlying `GeoArrowBuffers`.""" - return self.GeoColumnAccessor(self._column.points, self._column._meta) + return self.GeoColumnAccessor( + self._column.points, self._column._meta, Feature_Enum.POINT + ) @property def multipoints(self): """Access the `MultiPointArray` of the underlying `GeoArrowBuffers`.""" return self.MultiPointGeoColumnAccessor( - self._column.mpoints, self._column._meta + self._column.mpoints, self._column._meta, Feature_Enum.MULTIPOINT ) @property def lines(self): """Access the `LineArray` of the underlying `GeoArrowBuffers`.""" return self.LineStringGeoColumnAccessor( - self._column.lines, self._column._meta + self._column.lines, self._column._meta, Feature_Enum.LINESTRING ) @property def polygons(self): """Access the `PolygonArray` of the underlying `GeoArrowBuffers`.""" return self.PolygonGeoColumnAccessor( - self._column.polygons, self._column._meta + self._column.polygons, self._column._meta, Feature_Enum.POLYGON ) def __repr__(self): diff --git a/python/cuspatial/cuspatial/core/spatial/join.py b/python/cuspatial/cuspatial/core/spatial/join.py index c237fe9d3..fdaeb8f4a 100644 --- a/python/cuspatial/cuspatial/core/spatial/join.py +++ b/python/cuspatial/cuspatial/core/spatial/join.py @@ -214,14 +214,15 @@ def quadtree_point_in_polygon( raise ValueError( "`polygons` Geoseries must contains only polygons geometries." ) - - points_x = as_column(points.points.x) - points_y = as_column(points.points.y) - - poly_offsets = as_column(polygons.polygons.part_offset) - ring_offsets = as_column(polygons.polygons.ring_offset) - poly_points_x = as_column(polygons.polygons.x) - poly_points_y = as_column(polygons.polygons.y) + points_data = points.points + points_x = as_column(points_data.x) + points_y = as_column(points_data.y) + + polygon_data = polygons.polygons + poly_offsets = as_column(polygon_data.part_offset) + ring_offsets = as_column(polygon_data.ring_offset) + poly_points_x = as_column(polygon_data.x) + poly_points_y = as_column(polygon_data.y) return DataFrame._from_data( *spatial_join.quadtree_point_in_polygon(