Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(autoware_cuda_pointcloud_preprocessor): a cuda-accelerated pointcloud preprocessor #9454

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
af2e884
feat: moved the cuda pointcloud preprocessor and organized from a per…
knzo25 Nov 25, 2024
774e099
chore: fixed incorrect links
knzo25 Nov 25, 2024
be04f76
chore: fixed dead links pt2
knzo25 Nov 25, 2024
db02ec7
chore: fixed spelling errors
knzo25 Nov 25, 2024
5218a4a
chore: json schema fixes
knzo25 Nov 25, 2024
4a9daaf
chore: removed comments and filled the fields
knzo25 Nov 26, 2024
84a4b9f
fix: fixed the adapter for the case when the number of points in the …
knzo25 Nov 26, 2024
1e27534
Merge remote-tracking branch 'awf/main' into feat/cuda_blackboard_poi…
knzo25 Dec 23, 2024
b3c1d72
feat: used the cuda host allocators for aster host to device copies
knzo25 Dec 23, 2024
887f162
Update sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointc…
knzo25 Jan 10, 2025
0b46fb6
Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcl…
knzo25 Jan 10, 2025
970055e
Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcl…
knzo25 Jan 10, 2025
71b8a6f
style(pre-commit): autofix
pre-commit-ci[bot] Jan 10, 2025
d6ee47f
Update sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointc…
knzo25 Jan 10, 2025
3f6953f
Update sensing/autoware_cuda_pointcloud_preprocessor/README.md
knzo25 Jan 10, 2025
db99ba6
Update sensing/autoware_cuda_pointcloud_preprocessor/README.md
knzo25 Jan 10, 2025
9ce2e06
Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcl…
knzo25 Jan 10, 2025
f474900
style(pre-commit): autofix
pre-commit-ci[bot] Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
cmake_minimum_required(VERSION 3.14)
project(autoware_cuda_pointcloud_preprocessor)

find_package(ament_cmake_auto REQUIRED)
find_package(CUDA)
find_package(PCL REQUIRED)

if(NOT ${CUDA_FOUND})
message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.")
return()
endif()

ament_auto_find_build_dependencies()

# Default to C++17
if (NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif ()

if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function)
endif ()

if(BUILD_TESTING)
find_package(ament_lint_auto REQUIRED)
ament_lint_auto_find_test_dependencies()
endif()

include_directories(
include
SYSTEM
${CUDA_INCLUDE_DIRS}
)

list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") # cSpell: ignore expt

cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED
src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
)

target_link_libraries(cuda_pointcloud_preprocessor_lib
${PCL_LIBRARIES}
)

target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE
${autoware_point_types_INCLUDE_DIRS}
${cuda_blackboard_INCLUDE_DIRS}
${geometry_msgs_INCLUDE_DIRS}
${nebula_common_INCLUDE_DIRS}
${rclcpp_INCLUDE_DIRS}
${rcl_interfaces_INCLUDE_DIRS}
${sensor_msgs_INCLUDE_DIRS}
${tf2_INCLUDE_DIRS}
)

# Targets
ament_auto_add_library(cuda_organized_pointcloud_adapter SHARED
src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
)

target_link_libraries(cuda_organized_pointcloud_adapter
${CUDA_LIBRARIES}
)

ament_auto_add_library(cuda_pointcloud_preprocessor SHARED
src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
)

target_link_libraries(cuda_pointcloud_preprocessor
${CUDA_LIBRARIES}
cuda_pointcloud_preprocessor_lib
)

rclcpp_components_register_node(cuda_organized_pointcloud_adapter
PLUGIN "autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode"
EXECUTABLE cuda_organized_pointcloud_adapter_node
)

rclcpp_components_register_node(cuda_pointcloud_preprocessor
PLUGIN "autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode"
EXECUTABLE cuda_pointcloud_preprocessor_node
)

install(DIRECTORY launch config
DESTINATION share/${PROJECT_NAME}
)

ament_auto_package()

# Set ROS_DISTRO macros
set(ROS_DISTRO $ENV{ROS_DISTRO})
if(${ROS_DISTRO} STREQUAL "rolling")
add_compile_definitions(ROS_DISTRO_ROLLING)
elseif(${ROS_DISTRO} STREQUAL "foxy")
add_compile_definitions(ROS_DISTRO_FOXY)
elseif(${ROS_DISTRO} STREQUAL "galactic")
add_compile_definitions(ROS_DISTRO_GALACTIC)
elseif(${ROS_DISTRO} STREQUAL "humble")
add_compile_definitions(ROS_DISTRO_HUMBLE)
endif()
20 changes: 20 additions & 0 deletions sensing/autoware_cuda_pointcloud_preprocessor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# autoware_cuda_pointcloud_preprocessor

## Purpose

The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce.

To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs >
knzo25 marked this conversation as resolved.
Show resolved Hide resolved

## Inner-workings / Algorithms

A detailed description of each filter's algorithm is available in the following links.

| Filter Name | Description | Detail |
| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- |
| cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda | [link](docs/cuda-organized-pointcloud-adapter.md) |
| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s cpu versions. | [link](docs/cuda-pointcloud-preprocessor.md) |
knzo25 marked this conversation as resolved.
Show resolved Hide resolved

## (Optional) Future extensions / Unimplemented parts

The subsample filters implemented in `autoware_pointcloud_preprocessor` will have similar counterparts in this package.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/**:
ros__parameters:

base_frame: base_link
use_imu: true
use_3d_distortion_correction: false
distance_ratio: 1.03
object_length_threshold: 0.1
num_points_threshold: 4

self_crop.min_x: 1.0
self_crop.min_y: 1.0
self_crop.min_z: 1.0
self_crop.max_x: -1.0
self_crop.max_y: -1.0
self_crop.max_z: -1.0
mirror_crop.min_x: 1.0
mirror_crop.min_y: 1.0
mirror_crop.min_z: 1.0
mirror_crop.max_x: -1.0
mirror_crop.max_y: -1.0
mirror_crop.max_z: -1.0
Comment on lines +11 to +22
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having two hard-coded crop-box filters here, a list would be more easily extensible and it should be quite straight-forward to change the implementation.

Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# cuda_organized_pointcloud_adapter

## Purpose

The node `cuda_pointcloud_preprocessor` expects a 2D pointcloud where every row represents a single channel/ring and each row's points are in non decreasing azimuth order.

To utilize the previously mentioned node, this node provides an adapter to convert standard flat pointclouds (`height` equals to 1) to the required 2D tensor.

In addition, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.

## Inner-workings / Algorithms

To create the required 2D tensor, this node iterates the input pointcloud sequentially, filling the output 2D tensor depending on the input point's channel.

The output tensor's size is also estimated in this node, based on the largest `channel` value and the maximum number of points per channel observed so far.

## Inputs / Outputs

### Input

| Name | Type | Description |
| -------------------- | ------------------------------- | ------------------------- |
| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. |

### Output

| Name | Type | Description |
| -------------------------- | ------------------------------------------------ | ---------------------------------------- |
| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic |
| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic |

## Parameters

### Core Parameters

{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }}

## Assumptions / Known limits

- This algorithm assumes that the input points will be in non-decreasing azimuth order (per ring).
- This node expects that the input pointcloud is flat (`height` equals to 1) and follows the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# cuda_pointcloud_preprocessor

## Purpose

This node implements all standard pointcloud preprocessing algorithms applied to a single LiDAR's pointcloud in CUDA.
In particular, this node implements:

- crop boxing (ego-vehicle and ego-vehicle's mirrors)
knzo25 marked this conversation as resolved.
Show resolved Hide resolved
- distortion correction
- ring-based outlier filtering

## Inner-workings / Algorithms

As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms.

In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.

## Inputs / Outputs

### Input

| Name | Type | Description |
| ------------------------- | ------------------------------------------------ | ----------------------------------------- |
| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. |
| `~/input/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Input pointcloud's type negotiation topic |
| `~/input/twist` | `geometry_msgs::msg::TwistWithCovarianceStamped` | Topic of the twist information. |
| `~/input/imu` | `sensor_msgs::msg::Imu` | Topic of the IMU data. |

### Output

| Name | Type | Description |
| -------------------------- | ------------------------------------------------ | ---------------------------------------- |
| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic |
| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic |

## Parameters

### Core Parameters

{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }}

## Assumptions / Known limits

- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize the GPU use.
knzo25 marked this conversation as resolved.
Show resolved Hide resolved
- This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
- The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud.
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright 2024 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
#define AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_

#include <autoware/point_types/types.hpp>
#include <autoware/universe_utils/ros/debug_publisher.hpp>
#include <autoware/universe_utils/system/stop_watch.hpp>
#include <cuda_blackboard/cuda_adaptation.hpp>
#include <cuda_blackboard/cuda_blackboard_publisher.hpp>
#include <cuda_blackboard/cuda_pointcloud2.hpp>
#include <rclcpp/rclcpp.hpp>

#include <sensor_msgs/msg/point_cloud2.hpp>

#include <tf2/transform_datatypes.h>
#include <tf2_ros/buffer.h>
#include <tf2_ros/transform_listener.h>

#include <chrono>
#include <deque>
#include <memory>
#include <vector>

namespace autoware::cuda_organized_pointcloud_adapter
{

class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node
{
public:
explicit CudaOrganizedPointcloudAdapterNode(const rclcpp::NodeOptions & node_options);
~CudaOrganizedPointcloudAdapterNode() = default;

private:
void estimatePointcloudRingInfo(
const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);

bool orderPointcloud(
const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);

// Callback
void pointcloudCallback(
const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);

// Subscriber
rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{};

// CUDA pub
std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_;

std::size_t num_rings_{0};
std::size_t max_points_per_ring_{0};

std::vector<std::size_t> next_ring_index_;
std::vector<autoware::point_types::PointXYZIRCAEDT> buffer_;
// autoware::point_types::PointXYZIRCAEDT * device_buffer_;
cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_;

std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
};

} // namespace autoware::cuda_organized_pointcloud_adapter

#endif // AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
Loading
Loading