diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index 210cc33058a..7821b4a8f97 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -102,6 +102,7 @@ open3d_sycl_target_sources(core_impl PRIVATE linalg/LeastSquaresSYCL.cpp linalg/LUSYCL.cpp linalg/MatmulSYCL.cpp + nns/KnnSearchOpsSYCL.cpp linalg/SolveSYCL.cpp linalg/SVDSYCL.cpp linalg/TriSYCL.cpp diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.cpp b/cpp/open3d/core/nns/FixedRadiusIndex.cpp index 167a6701a32..a7316b1c8ff 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.cpp +++ b/cpp/open3d/core/nns/FixedRadiusIndex.cpp @@ -104,8 +104,16 @@ bool FixedRadiusIndex::SetTensorData(const Tensor &dataset_points, CALL_BUILD(double, BuildSpatialHashTableCUDA) #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return true; +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { CALL_BUILD(float, BuildSpatialHashTableCPU) @@ -171,8 +179,18 @@ std::tuple FixedRadiusIndex::SearchRadius( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + FixedRadiusSearchSYCL(RADIUS_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { @@ -240,8 +258,18 @@ std::tuple FixedRadiusIndex::SearchHybrid( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. 
Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + HybridSearchSYCL(HYBRID_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.h b/cpp/open3d/core/nns/FixedRadiusIndex.h index 838c1ccc484..d3247bb7d29 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.h +++ b/cpp/open3d/core/nns/FixedRadiusIndex.h @@ -376,6 +376,40 @@ void HybridSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool sort, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); + +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance); +#endif + /// \class FixedRadiusIndex /// /// \brief FixedRadiusIndex for nearest neighbor range search. 
diff --git a/cpp/open3d/core/nns/KnnIndex.cpp b/cpp/open3d/core/nns/KnnIndex.cpp index 2d1f5de6b51..a2c24233a2e 100644 --- a/cpp/open3d/core/nns/KnnIndex.cpp +++ b/cpp/open3d/core/nns/KnnIndex.cpp @@ -67,6 +67,17 @@ bool KnnIndex::SetTensorData(const Tensor& dataset_points, utility::LogError( "GPU Tensor is not supported when -DBUILD_CUDA_MODULE=OFF. " "Please recompile Open3d With -DBUILD_CUDA_MODULE=ON."); +#endif + } else if (dataset_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + dataset_points_ = dataset_points.Contiguous(); + points_row_splits_ = points_row_splits.Contiguous(); + index_dtype_ = index_dtype; + return true; +#else + utility::LogError( + "SYCL Tensor is not supported when -DBUILD_SYCL_MODULE=OFF. " + "Please recompile Open3D with -DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( @@ -124,13 +135,24 @@ std::pair KnnIndex::SearchKnn(const Tensor& query_points, }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + const Dtype index_dtype = GetIndexDtype(); + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + KnnSearchSYCL(KNN_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " - "-DBUILD_CUDA_MODULE=ON."); + "KnnIndex only supports CUDA and SYCL tensors. 
Please use " + "NanoFlannIndex instead for CPU tensors."); } return std::make_pair(neighbors_index, neighbors_distance); } diff --git a/cpp/open3d/core/nns/KnnIndex.h b/cpp/open3d/core/nns/KnnIndex.h index 458a3dd0b35..4ac3b7193f8 100644 --- a/cpp/open3d/core/nns/KnnIndex.h +++ b/cpp/open3d/core/nns/KnnIndex.h @@ -29,14 +29,28 @@ void KnnSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); +#endif + class KnnIndex : public NNSIndex { public: KnnIndex(); /// \brief Parameterized Constructor. /// - /// \param dataset_points Provides a set of data points as Tensor for KDTree - /// construction. + /// \param dataset_points Provides a set of data points as Tensor for + /// nearest neighbor search. CPU tensors use NanoFlann through + /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors + /// are handled by this class. 
KnnIndex(const Tensor& dataset_points); KnnIndex(const Tensor& dataset_points, const Dtype& index_dtype); ~KnnIndex(); diff --git a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp new file mode 100644 index 00000000000..2d20e4b86fa --- /dev/null +++ b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp @@ -0,0 +1,936 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/nns/FixedRadiusIndex.h" +#include "open3d/core/nns/KnnIndex.h" +#include "open3d/core/nns/NeighborSearchAllocator.h" +#include "open3d/utility/Helper.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace nns { + +// SYCL nearest-neighbor search uses tiled distance evaluation with AddMM, +// batched threshold/count helpers, and query-local top-k selection/merge. +namespace { + +// Chooses a query/point tile shape that limits temporary distance storage. +inline void ChooseTileSizeSYCL(int64_t num_queries, + int64_t num_points, + int64_t element_size, + int64_t& tile_queries, + int64_t& tile_points) { + constexpr int64_t kTargetTileBytes = 8 * 1024 * 1024; + tile_queries = std::min(num_queries, 128); + tile_queries = std::max(tile_queries, 1); + tile_points = std::max( + kTargetTileBytes / (tile_queries * element_size), int64_t(256)); + tile_points = std::min(tile_points, num_points); + tile_points = std::max(tile_points, 1); +} + +// Computes squared L2 distances from one query to all points with a SYCL +// kernel. 
This path is still used by the small single-row helpers below. +template +void ComputeSquaredDistancesSYCL(const Device& device, + const T* points_ptr, + int64_t num_points, + int64_t dimension, + const T* query_ptr, + T* distances_ptr) { + if (num_points == 0) { + return; + } + + constexpr int64_t kWorkGroupSize = 128; + const int64_t global_size = + utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.submit([&](sycl::handler& cgh) { + sycl::local_accessor local_query(sycl::range<1>(dimension), cgh); + cgh.parallel_for( + sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t lid = item.get_local_linear_id(); + for (int64_t d = lid; d < dimension; + d += item.get_local_range(0)) { + local_query[d] = query_ptr[d]; + } + item.barrier(sycl::access::fence_space::local_space); + + const int64_t point_idx = item.get_global_linear_id(); + if (point_idx >= num_points) { + return; + } + + const T* point = points_ptr + point_idx * dimension; + T dist = 0; + for (int64_t d = 0; d < dimension; ++d) { + const T diff = point[d] - local_query[d]; + dist += diff * diff; + } + distances_ptr[point_idx] = dist; + }); + }); +} + +// Counts how many entries per query are within a threshold using one launch per +// tile, avoiding per-query device algorithm dispatch. 
+template +void CountWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + int64_t* counts_ptr) { + if (num_queries == 0 || num_points == 0) { + return; + } + + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + int64_t local_count = 0; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + if (query_distances[point_idx] <= threshold) { + ++local_count; + } + } + counts_ptr[query_idx] += local_count; + }); +} + +// Appends per-query threshold matches into the final neighbor buffers using +// caller-provided query offsets. +template +void GatherWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + TIndex index_offset, + int64_t* offsets_ptr, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + if (num_queries == 0 || num_points == 0) { + return; + } + + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + int64_t offset = offsets_ptr[query_idx]; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + const T dist = query_distances[point_idx]; + if (dist <= threshold) { + out_indices_ptr[offset] = index_offset + point_idx; + if (out_distances_ptr != nullptr) { + out_distances_ptr[offset] = dist; + } + ++offset; + } + } + offsets_ptr[query_idx] = offset; + }); +} + +// Selects the smallest k distances per query with OneDPL partial_sort while +// reusing scratch index buffers allocated 
by the caller. +template +void SelectTopKQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + int64_t knn, + TIndex index_offset, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride, + bool use_threshold = false, + T threshold = T(0)) { + if (num_queries == 0 || num_points == 0 || knn <= 0) { + return; + } + + const T inf = std::numeric_limits::max(); + const int64_t actual_knn = std::min(knn, num_points); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + queue.parallel_for( + sycl::range<2>(num_queries, num_points), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + std::partial_sort(policy, query_indices, query_indices + actual_knn, + query_indices + num_points, + [query_distances](TIndex lhs, TIndex rhs) { + const T lhs_dist = query_distances[lhs]; + const T rhs_dist = query_distances[rhs]; + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs < rhs; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + + if (k >= actual_knn) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + + const TIndex local_idx = + scratch_indices_ptr[query_idx * 
scratch_query_stride + k]; + const T dist = + distances_ptr[query_idx * distance_query_stride + local_idx]; + if (use_threshold && dist > threshold) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + query_out_indices[k] = index_offset + local_idx; + query_out_distances[k] = dist; + }); +} + +// Merges two sorted top-k query buffers into a new top-k query buffer with +// OneDPL partial_sort over 2k candidates. +template +void MergeTopKQueriesSYCL(const Device& device, + const T* current_distances_ptr, + const TIndex* current_indices_ptr, + int64_t current_query_stride, + const T* candidate_distances_ptr, + const TIndex* candidate_indices_ptr, + int64_t candidate_query_stride, + int64_t num_queries, + int64_t knn, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride) { + if (num_queries == 0 || knn <= 0) { + return; + } + + const T inf = std::numeric_limits::max(); + const int64_t combined_cols = 2 * knn; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + queue.parallel_for( + sycl::range<2>(num_queries, combined_cols), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_current_dist = + current_distances_ptr + query_idx * current_query_stride; + const TIndex* query_current_idx = + current_indices_ptr + query_idx * current_query_stride; + const T* query_candidate_dist = + candidate_distances_ptr + query_idx * candidate_query_stride; + const TIndex* query_candidate_idx = + candidate_indices_ptr + query_idx * candidate_query_stride; + + std::partial_sort( + policy, query_indices, query_indices + knn, + query_indices + 
combined_cols, + [query_current_dist, query_current_idx, query_candidate_dist, + query_candidate_idx, knn](TIndex lhs, TIndex rhs) { + const bool lhs_is_current = lhs < knn; + const bool rhs_is_current = rhs < knn; + const TIndex lhs_idx = + lhs_is_current ? query_current_idx[lhs] + : query_candidate_idx[lhs - knn]; + const TIndex rhs_idx = + rhs_is_current ? query_current_idx[rhs] + : query_candidate_idx[rhs - knn]; + const T lhs_dist = + lhs_is_current ? query_current_dist[lhs] + : query_candidate_dist[lhs - knn]; + const T rhs_dist = + rhs_is_current ? query_current_dist[rhs] + : query_candidate_dist[rhs - knn]; + const bool lhs_valid = lhs_idx >= 0; + const bool rhs_valid = rhs_idx >= 0; + if (lhs_valid != rhs_valid) { + return lhs_valid; + } + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs_idx < rhs_idx; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + const TIndex source = + scratch_indices_ptr[query_idx * scratch_query_stride + k]; + const bool is_current = source < knn; + const int64_t offset = is_current ? source : source - knn; + const TIndex idx = + is_current + ? current_indices_ptr[query_idx * current_query_stride + + offset] + : candidate_indices_ptr[query_idx * + candidate_query_stride + + offset]; + const T dist = + is_current + ? 
current_distances_ptr[query_idx * + current_query_stride + + offset] + : candidate_distances_ptr + [query_idx * candidate_query_stride + offset]; + if (idx < 0) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + } else { + query_out_indices[k] = idx; + query_out_distances[k] = dist; + } + }); +} + +// Finalizes hybrid-search counts and clears any unused tail elements in the +// fixed-width output tensors. +template +void FinalizeHybridResultsSYCL(const Device& device, + const int64_t* counts_ptr, + int64_t num_queries, + int64_t max_knn, + TIndex* neighbors_index_ptr, + T* neighbors_distance_ptr, + TIndex* neighbors_count_ptr) { + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const int64_t clipped_count = + std::min(counts_ptr[query_idx], max_knn); + neighbors_count_ptr[query_idx] = static_cast(clipped_count); + for (int64_t k = clipped_count; k < max_knn; ++k) { + neighbors_index_ptr[query_idx * max_knn + k] = TIndex(-1); + neighbors_distance_ptr[query_idx * max_knn + k] = T(0); + } + }); +} + +} // namespace + +// Batched KNN search with tiled AddMM distance evaluation and query-local top-k +// selection/merge on each tile. 
+template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + const Device device = points.GetDevice(); + const Dtype dtype = points.GetDtype(); + const Dtype index_dtype = Dtype::FromType(); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + const int batch_size = points_row_splits.GetShape(0) - 1; + std::vector> batch_output_allocators( + batch_size, NeighborSearchAllocator(device)); + + int64_t* neighbors_row_splits_ptr = + neighbors_row_splits.GetDataPtr(); + int64_t last_neighbors_count = 0; + int64_t batch_knn = 0; + + for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + batch_knn = std::min(knn, num_points_i); + + neighbors_row_splits_ptr[query_begin] = last_neighbors_count; + for (int64_t q = 0; q < num_queries_i; ++q) { + neighbors_row_splits_ptr[query_begin + q + 1] = + last_neighbors_count + (q + 1) * batch_knn; + } + last_neighbors_count += num_queries_i * batch_knn; + + TIndex* indices_ptr; + T* distances_ptr; + batch_output_allocators[batch_idx].AllocIndices( + &indices_ptr, num_queries_i * batch_knn, TIndex(-1)); + batch_output_allocators[batch_idx].AllocDistances( + &distances_ptr, num_queries_i * batch_knn, T(0)); + + Tensor out_indices = + batch_output_allocators[batch_idx].NeighborsIndex().View( + 
{num_queries_i, batch_knn}); + Tensor out_distances = + batch_output_allocators[batch_idx].NeighborsDistance().View( + {num_queries_i, batch_knn}); + + // Precompute norms for batched distance evaluation. + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries_i, num_points_i, sizeof(T), tile_queries, + tile_points); + + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = + Tensor::Empty({tile_queries, tile_points}, index_dtype, device); + Tensor tile_top_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor best_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor best_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merged_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor merged_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty({tile_queries, 2 * batch_knn}, + index_dtype, device); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + + Tensor best_indices_view = + best_indices.Slice(0, 0, num_queries_iter); + Tensor best_distances_view = + best_distances.Slice(0, 0, num_queries_iter); + best_indices_view.Fill(TIndex(-1)); + best_distances_view.Fill(std::numeric_limits::max()); + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + 
num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, batch_knn, TIndex(p), + tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), batch_knn); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), batch_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), batch_knn, + num_queries_iter, batch_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), batch_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; + } + + out_indices.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_indices_view; + out_distances.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_distances_view; + } + + queue.wait_and_throw(); + } + + if (batch_size == 1) { + neighbors_index = batch_output_allocators[0].NeighborsIndex().View( + {queries.GetShape(0), batch_knn}); + neighbors_distance = + 
batch_output_allocators[0].NeighborsDistance().View( + {queries.GetShape(0), batch_knn}); + return; + } + + NeighborSearchAllocator output_allocator(device); + int64_t neighbors_size = 0; + for (const auto& allocator : batch_output_allocators) { + neighbors_size += allocator.NeighborsIndex().GetShape(0); + } + + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, neighbors_size); + output_allocator.AllocDistances(&neighbors_distance_ptr, neighbors_size); + + int64_t offset = 0; + for (const auto& allocator : batch_output_allocators) { + const int64_t batch_size_i = allocator.NeighborsIndex().GetShape(0); + if (batch_size_i == 0) { + continue; + } + MemoryManager::Memcpy(neighbors_index_ptr + offset, device, + allocator.IndicesPtr(), device, + sizeof(TIndex) * batch_size_i); + MemoryManager::Memcpy(neighbors_distance_ptr + offset, device, + allocator.DistancesPtr(), device, + sizeof(T) * batch_size_i); + offset += batch_size_i; + } + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +// Fixed-radius search using two tiled passes: one to count matches and one to +// gather indices and optional distances. 
+template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL fixed radius search only supports L2 metric."); + } + if (ignore_query_point) { + utility::LogError( + "SYCL fixed radius search does not support " + "ignore_query_point."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); + const T threshold = static_cast(radius * radius); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += 
tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + } + } + } + + queue.wait_and_throw(); + + Tensor counts_cpu = counts.To(Device("CPU:0")); + const int64_t* counts_cpu_ptr = counts_cpu.GetDataPtr(); + std::vector row_splits(num_queries + 1, 0); + for (int64_t q = 0; q < num_queries; ++q) { + row_splits[q + 1] = row_splits[q] + counts_cpu_ptr[q]; + } + neighbors_row_splits = + Tensor(row_splits, {num_queries + 1}, Int64).To(device); + + std::vector row_offsets(row_splits.begin(), row_splits.end() - 1); + Tensor write_offsets = Tensor(row_offsets, {num_queries}, Int64).To(device); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, row_splits.back()); + if (return_distances) { + output_allocator.AllocDistances(&neighbors_distance_ptr, + row_splits.back()); + } else { + 
output_allocator.AllocDistances(&neighbors_distance_ptr, 0); + } + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* offsets_ptr = + write_offsets.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + GatherWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, TIndex(point_begin + p), + offsets_ptr, 
neighbors_index_ptr, + return_distances ? neighbors_distance_ptr : nullptr); + } + } + } + + queue.wait_and_throw(); + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +// Hybrid search counts all neighbors within radius while keeping only the best +// max_knn results per query in fixed-size output tensors. +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL hybrid search only supports L2 metric."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); + const T threshold = static_cast(radius * radius); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, num_queries * max_knn, + TIndex(-1)); + output_allocator.AllocDistances(&neighbors_distance_ptr, + num_queries * max_knn, + std::numeric_limits::max()); + + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + Tensor out_indices = + output_allocator.NeighborsIndex().View({num_queries, max_knn}); + Tensor out_distances = + output_allocator.NeighborsDistance().View({num_queries, max_knn}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = Tensor::Empty({tile_queries, tile_points}, + Dtype::FromType(), 
device); + Tensor tile_top_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merged_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor merged_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty( + {tile_queries, 2 * max_knn}, Dtype::FromType(), device); + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + Tensor best_indices_view = out_indices.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + Tensor best_distances_view = out_distances.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor 
point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, max_knn, TIndex(point_begin + p), + tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), max_knn, true, + threshold); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), max_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), max_knn, + num_queries_iter, max_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), max_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; + } + } + } + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); + neighbors_count = + Tensor::Empty({num_queries}, Dtype::FromType(), device); + 
FinalizeHybridResultsSYCL(device, counts.GetDataPtr(), + num_queries, max_knn, + neighbors_index.GetDataPtr(), + neighbors_distance.GetDataPtr(), + neighbors_count.GetDataPtr()); + queue.wait_and_throw(); +} + +#define INSTANTIATE(T, TIndex) \ + template void KnnSearchSYCL( \ + const Tensor& points, const Tensor& points_row_splits, \ + const Tensor& queries, const Tensor& queries_row_splits, int knn, \ + Tensor& neighbors_index, Tensor& neighbors_row_splits, \ + Tensor& neighbors_distance); \ + template void FixedRadiusSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + const Tensor& points_row_splits, const Tensor& queries_row_splits, \ + const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ + const bool ignore_query_point, const bool return_distances, \ + const bool sort, Tensor& neighbors_index, \ + Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ + template void HybridSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + int max_knn, const Tensor& points_row_splits, \ + const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ + const Tensor&, const Metric metric, Tensor& neighbors_index, \ + Tensor& neighbors_count, Tensor& neighbors_distance); + +INSTANTIATE(float, int32_t) +INSTANTIATE(float, int64_t) +INSTANTIATE(double, int32_t) +INSTANTIATE(double, int64_t) + +} // namespace nns +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.cpp b/cpp/open3d/core/nns/NearestNeighborSearch.cpp index 9cc8030a35b..f7889f737bb 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.cpp +++ b/cpp/open3d/core/nns/NearestNeighborSearch.cpp @@ -21,21 +21,18 @@ bool NearestNeighborSearch::SetIndex() { }; bool NearestNeighborSearch::KnnIndex() { - if (dataset_points_.IsCUDA()) { -#ifdef BUILD_CUDA_MODULE + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { knn_index_.reset(new nns::KnnIndex()); return 
knn_index_->SetTensorData(dataset_points_, index_dtype_); -#else - utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please recompile Open3D with " - "-DBUILD_CUDA_MODULE=ON."); -#endif } else { return SetIndex(); } }; -bool NearestNeighborSearch::MultiRadiusIndex() { return SetIndex(); }; +bool NearestNeighborSearch::MultiRadiusIndex() { + AssertCPU(dataset_points_); + return SetIndex(); +}; bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { if (dataset_points_.IsCUDA()) { @@ -53,6 +50,15 @@ bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError( + "radius is required for SYCL FixedRadiusIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); + } return SetIndex(); } } @@ -72,6 +78,14 @@ bool NearestNeighborSearch::HybridIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError("radius is required for SYCL HybridIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); + } return SetIndex(); } }; @@ -80,7 +94,7 @@ std::pair NearestNeighborSearch::KnnSearch( const Tensor& query_points, int knn) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (knn_index_) { return knn_index_->SearchKnn(query_points, knn); } else { @@ -99,7 +113,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( const Tensor& query_points, double radius, bool sort) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return 
fixed_radius_index_->SearchRadius(query_points, radius, sort); @@ -117,7 +131,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( std::tuple NearestNeighborSearch::MultiRadiusSearch( const Tensor& query_points, const Tensor& radii) { - AssertNotCUDA(query_points); + AssertCPU(query_points); AssertTensorDtype(query_points, dataset_points_.GetDtype()); AssertTensorDtype(radii, dataset_points_.GetDtype()); @@ -133,7 +147,7 @@ std::tuple NearestNeighborSearch::HybridSearch( const int max_knn) const { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return fixed_radius_index_->SearchHybrid(query_points, radius, max_knn); @@ -150,11 +164,11 @@ std::tuple NearestNeighborSearch::HybridSearch( } } -void NearestNeighborSearch::AssertNotCUDA(const Tensor& t) const { - if (t.IsCUDA()) { +void NearestNeighborSearch::AssertCPU(const Tensor& t) const { + if (!t.IsCPU()) { utility::LogError( - "TODO: NearestNeighborSearch does not support CUDA tensor " - "yet."); + "NearestNeighborSearch only supports CPU tensors for this " + "operation."); } } diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.h b/cpp/open3d/core/nns/NearestNeighborSearch.h index b9fe14ad1ed..aa22ca352b5 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.h +++ b/cpp/open3d/core/nns/NearestNeighborSearch.h @@ -27,14 +27,12 @@ class NearestNeighborSearch { /// Constructor. /// /// \param dataset_points Dataset points for constructing search index. Must - /// be 2D, with shape {n, d}. SYCL tensors are not yet supported. + /// be 2D, with shape {n, d}. 
// NearestNeighborSearch(const Tensor &dataset_points) // : dataset_points_(dataset_points){}; NearestNeighborSearch(const Tensor &dataset_points, const Dtype &index_dtype = core::Int32) - : dataset_points_(dataset_points), index_dtype_(index_dtype) { - AssertNotSYCL(dataset_points_); - }; + : dataset_points_(dataset_points), index_dtype_(index_dtype) {} ~NearestNeighborSearch(); NearestNeighborSearch(const NearestNeighborSearch &) = delete; NearestNeighborSearch &operator=(const NearestNeighborSearch &) = delete; @@ -47,17 +45,22 @@ class NearestNeighborSearch { /// Set index for multi-radius search. /// + /// Multi-radius search is currently only supported for CPU tensors. + /// /// \return Returns true if building index success, otherwise false. bool MultiRadiusIndex(); /// Set index for fixed-radius search. /// - /// \param radius optional radius parameter. required for gpu fixed radius - /// index. \return Returns true if building index success, otherwise false. + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// fixed radius indices. + /// \return Returns true if building index success, otherwise false. bool FixedRadiusIndex(std::optional radius = {}); /// Set index for hybrid search. /// + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// hybrid indices. /// \return Returns true if building index success, otherwise false. bool HybridIndex(std::optional radius = {}); @@ -88,6 +91,7 @@ class NearestNeighborSearch { const Tensor &query_points, double radius, bool sort = true); /// Perform multi-radius search. Each query point has an independent radius. + /// Only CPU tensors are currently supported for this search mode. /// /// \param query_points Query points. Must be 2D, with shape {n, d}. /// \param radii Radii of query points. Each query point has one radius. @@ -121,8 +125,8 @@ class NearestNeighborSearch { private: bool SetIndex(); - /// Assert a Tensor is not CUDA tensoer. 
This will be removed in the future. - void AssertNotCUDA(const Tensor &t) const; + /// Assert a Tensor is on CPU. + void AssertCPU(const Tensor &t) const; protected: std::unique_ptr nanoflann_index_; diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index 223c4d4c7a8..560745b6b5d 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -30,6 +30,12 @@ if (BUILD_CUDA_MODULE) ) endif() +if (BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tgeometry_kernel PRIVATE + TransformSYCL.cpp + ) +endif() + if (WITH_IPP) target_sources(tgeometry_kernel PRIVATE IPPImage.cpp diff --git a/cpp/open3d/t/geometry/kernel/Transform.cpp b/cpp/open3d/t/geometry/kernel/Transform.cpp index 06ec1de454f..185be29bfc2 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.cpp +++ b/cpp/open3d/t/geometry/kernel/Transform.cpp @@ -31,6 +31,12 @@ void TransformPoints(const core::Tensor& transformation, core::Tensor& points) { } else if (points.IsCUDA()) { CUDA_CALL(TransformPointsCUDA, transformation_contiguous, points_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformPointsSYCL(transformation_contiguous, points_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -53,6 +59,12 @@ void TransformNormals(const core::Tensor& transformation, } else if (normals.IsCUDA()) { CUDA_CALL(TransformNormalsCUDA, transformation_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformNormalsSYCL(transformation_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -78,6 +90,12 @@ void RotatePoints(const core::Tensor& R, } else if (points.IsCUDA()) { CUDA_CALL(RotatePointsCUDA, R_contiguous, points_contiguous, 
center_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotatePointsSYCL(R_contiguous, points_contiguous, center_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -97,6 +115,12 @@ void RotateNormals(const core::Tensor& R, core::Tensor& normals) { RotateNormalsCPU(R_contiguous, normals_contiguous); } else if (normals.IsCUDA()) { CUDA_CALL(RotateNormalsCUDA, R_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotateNormalsSYCL(R_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } diff --git a/cpp/open3d/t/geometry/kernel/Transform.h b/cpp/open3d/t/geometry/kernel/Transform.h index d64d860572e..e51ebae73c4 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.h +++ b/cpp/open3d/t/geometry/kernel/Transform.h @@ -52,6 +52,20 @@ void RotatePointsCUDA(const core::Tensor& R, void RotateNormalsCUDA(const core::Tensor& R, core::Tensor& normals); #endif +#ifdef BUILD_SYCL_MODULE +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points); + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals); + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center); + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals); +#endif + } // namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformImpl.h b/cpp/open3d/t/geometry/kernel/TransformImpl.h index 14c08a01e52..cedcc22fd87 100644 --- a/cpp/open3d/t/geometry/kernel/TransformImpl.h +++ b/cpp/open3d/t/geometry/kernel/TransformImpl.h @@ -87,6 +87,8 @@ OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void RotateNormalsKernel( normals_ptr[2] = x[2]; } +#ifndef 
OPEN3D_SKIP_TRANSFORM_MAIN + #ifdef __CUDACC__ void TransformPointsCUDA #else @@ -167,6 +169,8 @@ void RotateNormalsCPU }); } +#endif // OPEN3D_SKIP_TRANSFORM_MAIN + } // namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp new file mode 100644 index 00000000000..9bfd9078e3a --- /dev/null +++ b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp @@ -0,0 +1,104 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates. +#define OPEN3D_SKIP_TRANSFORM_MAIN +#include "open3d/t/geometry/kernel/TransformImpl.h" +#undef OPEN3D_SKIP_TRANSFORM_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace geometry { +namespace kernel { +namespace transform { + +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* transformation_ptr = + transformation.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformPointsKernel( + transformation_ptr, + points_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); 
+ DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* transformation_ptr = + transformation.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformNormalsKernel( + transformation_ptr, + normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const scalar_t* center_ptr = center.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotatePointsKernel(R_ptr, + points_ptr + 3 * id[0], + center_ptr); + }) + .wait_and_throw(); + }); +} + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotateNormalsKernel(R_ptr, normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +} // namespace transform +} // namespace kernel +} // namespace geometry +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index a766715e6e7..785d58c85e4 100644 --- a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -22,6 +22,16 @@ if (BUILD_CUDA_MODULE) ) endif() +if 
(BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tpipelines_kernel PRIVATE + RegistrationSYCL.cpp + FillInLinearSystemSYCL.cpp + RGBDOdometrySYCL.cpp + TransformationConverterSYCL.cpp + FeatureSYCL.cpp + ) +endif() + open3d_show_and_abort_on_warning(tpipelines_kernel) open3d_set_global_properties(tpipelines_kernel) # The kernels are used in the unit tests, so they cannot be hidden for now. diff --git a/cpp/open3d/t/pipelines/kernel/Feature.cpp b/cpp/open3d/t/pipelines/kernel/Feature.cpp index c73573054aa..5a779ecc48b 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.cpp +++ b/cpp/open3d/t/pipelines/kernel/Feature.cpp @@ -44,10 +44,20 @@ void ComputeFPFHFeature( if (points_d.IsCPU()) { ComputeFPFHFeatureCPU(points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); - } else { + } else if (points_d.IsCUDA()) { core::CUDAScopedDevice scoped_device(points.GetDevice()); CUDA_CALL(ComputeFPFHFeatureCUDA, points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); + } else if (points_d.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeFPFHFeatureSYCL(points_d, normals_d, indices, distance2, + counts_d, fpfhs, mask, + map_info_idx_to_point_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif + } else { + utility::LogError("Unimplemented device."); } utility::LogDebug( "[ComputeFPFHFeature] Computed {:d} features from " diff --git a/cpp/open3d/t/pipelines/kernel/Feature.h b/cpp/open3d/t/pipelines/kernel/Feature.h index 6645c9bb6cc..cc1d66f6920 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.h +++ b/cpp/open3d/t/pipelines/kernel/Feature.h @@ -49,6 +49,19 @@ void ComputeFPFHFeatureCUDA( std::nullopt); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional &mask = 
std::nullopt, + const std::optional &map_batch_info_idx_to_point_idx = + std::nullopt); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h index c9948b6a62d..5caef81ff49 100644 --- a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h @@ -104,6 +104,7 @@ OPEN3D_HOST_DEVICE void UpdateSPFHFeature(const scalar_t *feature, spfh[idx * 33 + h_index3 + 22] += hist_incr; } +#ifndef OPEN3D_SKIP_FPFH_MAIN #if defined(__CUDACC__) void ComputeFPFHFeatureCUDA #else @@ -291,6 +292,7 @@ void ComputeFPFHFeatureCPU }); }); } +#endif // OPEN3D_SKIP_FPFH_MAIN } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp new file mode 100644 index 00000000000..8c41acf7aab --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp @@ -0,0 +1,205 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates. 
+#define OPEN3D_SKIP_FPFH_MAIN +#include "open3d/t/pipelines/kernel/FeatureImpl.h" +#undef OPEN3D_SKIP_FPFH_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional &mask, + const std::optional &map_info_idx_to_point_idx) { + const core::Dtype dtype = points.GetDtype(); + const core::Device device = points.GetDevice(); + const int64_t n_points = points.GetLength(); + + const bool filter_fpfh = + mask.has_value() && map_info_idx_to_point_idx.has_value(); + if (mask.has_value() ^ map_info_idx_to_point_idx.has_value()) { + utility::LogError( + "Parameters mask and map_info_idx_to_point_idx must " + "either be both provided or both not provided."); + } + if (filter_fpfh) { + if (mask.value().GetShape()[0] != n_points) { + utility::LogError( + "Parameter mask was provided, but its size {:d} should" + "be equal to the number of points {:d}.", + (int)mask.value().GetShape()[0], n_points); + } + if (map_info_idx_to_point_idx.value().GetShape()[0] != + counts.GetShape()[0] - (indices.GetShape().size() == 1 ? 1 : 0)) { + utility::LogError( + "Parameter map_info_idx_to_point_idx was provided, " + "but its size" + "{:d} should be equal to the size of counts {:d}.", + (int)map_info_idx_to_point_idx.value().GetShape()[0], + (int)counts.GetShape()[0]); + } + } + + core::Tensor map_spfh_info_idx_to_point_idx = + map_info_idx_to_point_idx.value_or( + core::Tensor::Empty({0}, core::Int64, device)); + + const core::Tensor map_fpfh_idx_to_point_idx = + filter_fpfh ? mask.value().NonZero().GetItem( + {core::TensorKey::Index(0)}) + : core::Tensor::Empty({0}, core::Int64, device); + + const int32_t n_fpfh = + filter_fpfh ? 
map_fpfh_idx_to_point_idx.GetLength() : n_points; + const int32_t n_spfh = + filter_fpfh ? map_spfh_info_idx_to_point_idx.GetLength() : n_points; + + core::Tensor spfhs = + core::Tensor::Zeros({n_spfh, 33}, dtype, fpfhs.GetDevice()); + + core::Tensor map_point_idx_to_spfh_idx; + if (filter_fpfh) { + map_point_idx_to_spfh_idx = core::Tensor::Full( + {n_points}, -1, core::Int64, fpfhs.GetDevice()); + map_point_idx_to_spfh_idx.IndexSet( + {map_spfh_info_idx_to_point_idx}, + core::Tensor::Arange(0, n_spfh, 1, core::Int64, + fpfhs.GetDevice())); + } else { + map_point_idx_to_spfh_idx = + core::Tensor::Empty({0}, core::Int64, fpfhs.GetDevice()); + } + + // Check the nns type (knn = hybrid = false, radius = true). + // The nns radius search mode will resulting a prefix sum 1D tensor. + bool is_radius_search; + int nn_size = 0; + if (indices.GetShape().size() == 1) { + is_radius_search = true; + } else { + is_radius_search = false; + nn_size = indices.GetShape()[1]; + } + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t *points_ptr = points.GetDataPtr(); + const scalar_t *normals_ptr = normals.GetDataPtr(); + const int32_t *indices_ptr = indices.GetDataPtr(); + const scalar_t *distance2_ptr = distance2.GetDataPtr(); + const int32_t *counts_ptr = counts.GetDataPtr(); + scalar_t *spfhs_ptr = spfhs.GetDataPtr(); + scalar_t *fpfhs_ptr = fpfhs.GetDataPtr(); + const int64_t *map_spfh_info_idx_to_point_idx_ptr = + map_spfh_info_idx_to_point_idx.GetDataPtr(); + const int64_t *map_fpfh_idx_to_point_idx_ptr = + map_fpfh_idx_to_point_idx.GetDataPtr(); + const int64_t *map_point_idx_to_spfh_idx_ptr = + map_point_idx_to_spfh_idx.GetDataPtr(); + + // Compute SPFH features for the points. + queue.parallel_for(sycl::range<1>{(size_t)n_spfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_point_idx = + filter_fpfh ? 
map_spfh_info_idx_to_point_idx_ptr + [workload_idx] + : workload_idx; + int64_t idx = 3 * workload_point_idx; + const scalar_t *point = points_ptr + idx; + const scalar_t *normal = normals_ptr + idx; + + const int indice_size = + is_radius_search ? (counts_ptr[workload_idx + 1] - + counts_ptr[workload_idx]) + : counts_ptr[workload_idx]; + + if (indice_size > 1) { + const scalar_t hist_incr = + 100.0 / static_cast(indice_size - 1); + for (int i = 1; i < indice_size; i++) { + const int point_idx = + is_radius_search + ? indices_ptr[i + + counts_ptr[workload_idx]] + : indices_ptr[workload_idx * nn_size + + i]; + + const scalar_t *point_ref = points_ptr + 3 * point_idx; + const scalar_t *normal_ref = + normals_ptr + 3 * point_idx; + scalar_t fea[4] = {0}; + ComputePairFeature(point, normal, point_ref, + normal_ref, fea); + UpdateSPFHFeature(fea, workload_idx, + hist_incr, spfhs_ptr); + } + } + }).wait_and_throw(); + + // Compute FPFH features for the points. + queue.parallel_for(sycl::range<1>{(size_t)n_fpfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_spfh_idx = + filter_fpfh ? map_point_idx_to_spfh_idx_ptr + [map_fpfh_idx_to_point_idx_ptr + [workload_idx]] + : workload_idx; + const int indice_size = + is_radius_search ? (counts_ptr[workload_spfh_idx + 1] - + counts_ptr[workload_spfh_idx]) + : counts_ptr[workload_spfh_idx]; + if (indice_size > 1) { + scalar_t sum[3] = {0.0, 0.0, 0.0}; + for (int i = 1; i < indice_size; i++) { + const int idx = + is_radius_search + ? i + counts_ptr[workload_spfh_idx] + : workload_spfh_idx * nn_size + i; + const scalar_t dist = distance2_ptr[idx]; + if (dist == 0.0) continue; + const int32_t spfh_idx = + filter_fpfh ? 
map_point_idx_to_spfh_idx_ptr + [indices_ptr[idx]] + : indices_ptr[idx]; + for (int j = 0; j < 33; j++) { + const scalar_t val = + spfhs_ptr[spfh_idx * 33 + j] / dist; + sum[j / 11] += val; + fpfhs_ptr[workload_idx * 33 + j] += val; + } + } + for (int j = 0; j < 3; j++) { + sum[j] = sum[j] != 0.0 ? 100.0 / sum[j] : 0.0; + } + for (int j = 0; j < 33; j++) { + fpfhs_ptr[workload_idx * 33 + j] *= sum[j / 11]; + fpfhs_ptr[workload_idx * 33 + j] += + spfhs_ptr[workload_spfh_idx * 33 + j]; + } + } + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp index 9d5e825da9a..f9e358d6636 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp @@ -60,6 +60,13 @@ void FillInRigidAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInRigidAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, + Ri_normal_ps, i, j, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -124,6 +131,15 @@ void FillInSLACAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, normal_ps, + Ri_normal_ps, RjT_Ri_normal_ps, + cgrid_idx_ps, cgrid_idx_qs, cgrid_ratio_ps, + cgrid_ratio_qs, i, j, n, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -163,6 +179,14 @@ void FillInSLACRegularizerTerm(core::Tensor &AtA, positions_init, positions_curr, weight, n, 
anchor_idx); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACRegularizerTermSYCL( + AtA, Atb, residual, grid_idx, grid_nbs_idx, grid_nbs_mask, + positions_init, positions_curr, weight, n, anchor_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h index 1c529813841..69da827ab61 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h @@ -130,6 +130,47 @@ void FillInSLACRegularizerTermCUDA(core::Tensor &AtA, int n, int anchor_idx); +#endif +#ifdef BUILD_SYCL_MODULE +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold); + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &normal_ps, + const core::Tensor &Ri_normal_ps, + const core::Tensor &RjT_Ri_normal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n, + float threshold); + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n, + int anchor_idx); + #endif } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h 
b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h index e158809f6a4..21732102836 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h @@ -13,6 +13,7 @@ namespace open3d { namespace t { namespace pipelines { namespace kernel { +#ifndef OPEN3D_SKIP_FILL_IN_LS_MAIN #if defined(__CUDACC__) void FillInRigidAlignmentTermCUDA #else @@ -471,6 +472,7 @@ void FillInSLACRegularizerTermCPU } }); } +#endif // OPEN3D_SKIP_FILL_IN_LS_MAIN } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp new file mode 100644 index 00000000000..a43767ba40e --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -0,0 +1,462 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; include only helper includes. 
+#define OPEN3D_SKIP_FILL_IN_LS_MAIN +#include "open3d/t/pipelines/kernel/FillInLinearSystemImpl.h" +#undef OPEN3D_SKIP_FILL_IN_LS_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/FillInLinearSystem.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_ps, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold) { + core::Device device = AtA.GetDevice(); + int64_t n = Ti_ps.GetLength(); + if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) { + utility::LogError( + "Unable to setup linear system: input length mismatch."); + } + + // First fill in a small 12 x 12 linear system using group reduction. + // kLocalDim = 12*12 + 12 + 1 = 157 elements. + static constexpr int kLocalDim12 = 144; // 12*12 + static constexpr int kLocalDimAtb = 12; + static constexpr int kLocalDimTotal = kLocalDim12 + kLocalDimAtb + 1; + + core::Tensor AtA_local = + core::Tensor::Zeros({12, 12}, core::Float32, device); + core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device); + + float *AtA_local_ptr = static_cast(AtA_local.GetDataPtr()); + float *Atb_local_ptr = static_cast(Atb_local.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const float *Ti_ps_ptr = static_cast(Ti_ps.GetDataPtr()); + const float *Tj_qs_ptr = static_cast(Tj_qs.GetDataPtr()); + const float *Ri_normal_ps_ptr = + static_cast(Ri_normal_ps.GetDataPtr()); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + 
sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + // Private accumulation buffer: [AtA_local(144) | + // Atb_local(12) | residual(1)] + float local_sum[kLocalDimTotal] = {}; + + if (gid < n) { + const float *p_prime = Ti_ps_ptr + 3 * gid; + const float *q_prime = Tj_qs_ptr + 3 * gid; + const float *normal_p_prime = + Ri_normal_ps_ptr + 3 * gid; + + float r = (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * + normal_p_prime[2]; + + if (sycl::fabs(r) <= threshold) { + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; + ++i_local) { + for (int j_local = 0; j_local < 12; + ++j_local) { + local_sum[i_local * 12 + j_local] += + J_ij[i_local] * J_ij[j_local]; + } + local_sum[kLocalDim12 + i_local] += + J_ij[i_local] * r; + } + local_sum[kLocalDim12 + kLocalDimAtb] += r * r; + } + } + + // Group-reduce each element and atomically add to + // the device-side local system buffers. 
+ auto grp = item.get_group(); + for (int k = 0; k < kLocalDim12; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[k], sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref( + AtA_local_ptr[k]) += gv; + } + } + for (int k = 0; k < kLocalDimAtb; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[kLocalDim12 + k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref( + Atb_local_ptr[k]) += gv; + } + } + { + float gv = sycl::reduce_over_group( + grp, local_sum[kLocalDim12 + kLocalDimAtb], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref( + *residual_ptr) += gv; + } + } + }); + }).wait_and_throw(); + + // Then fill-in the large linear system. + std::vector indices_vec(12); + for (int k = 0; k < 6; ++k) { + indices_vec[k] = i * 6 + k; + indices_vec[k + 6] = j * 6 + k; + } + + std::vector indices_i_vec; + std::vector indices_j_vec; + for (int local_i = 0; local_i < 12; ++local_i) { + for (int local_j = 0; local_j < 12; ++local_j) { + indices_i_vec.push_back(indices_vec[local_i]); + indices_j_vec.push_back(indices_vec[local_j]); + } + } + + core::Tensor indices(indices_vec, {12}, core::Int64, device); + core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device); + core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device); + + core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j}); + AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12})); + + core::Tensor Atb_sub = Atb.IndexGet({indices}); + Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1})); +} + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_Cps, + const core::Tensor &Tj_Cqs, + const core::Tensor &Cnormal_ps, + const core::Tensor &Ri_Cnormal_ps, + const core::Tensor &RjT_Ri_Cnormal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const 
core::Tensor &cgrid_ratio_ps,
+                                 int i,
+                                 int j,
+                                 int n_frags,
+                                 float threshold) {
+    int64_t n = Ti_Cps.GetLength();
+    if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
+        Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
+        cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
+        cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
+        utility::LogError(
+                "Unable to setup linear system: input length mismatch.");
+    }
+
+    core::Device device = AtA.GetDevice();
+    int n_vars = Atb.GetLength();
+    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
+    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
+    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
+
+    const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
+    const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
+    const float *Cnormal_ps_ptr =
+            static_cast<const float *>(Cnormal_ps.GetDataPtr());
+    const float *Ri_Cnormal_ps_ptr =
+            static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
+    const float *RjT_Ri_Cnormal_ps_ptr =
+            static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());
+
+    const int *cgrid_idx_ps_ptr =
+            static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
+    const int *cgrid_idx_qs_ptr =
+            static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
+    const float *cgrid_ratio_ps_ptr =
+            static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
+    const float *cgrid_ratio_qs_ptr =
+            static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());
+
+    sycl::queue queue =
+            core::sy::SYCLContext::GetInstance().GetDefaultQueue(device);
+    queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) {
+        int64_t workload_idx = id[0];
+        const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
+        const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
+        const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
+        const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx;
+        const float *RjTRi_Cnormal_p =
+                RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;
+
+        const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
+        const int *cgrid_idx_q = 
cgrid_idx_qs_ptr + 8 * workload_idx; + const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx; + const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx; + + float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + + (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + + (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; + if (sycl::fabs(r) > threshold) return; + + float J[60]; + int idx[60]; + + J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2]; + J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2]; + J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1]; + J[3] = Ri_Cnormal_p[0]; + J[4] = Ri_Cnormal_p[1]; + J[5] = Ri_Cnormal_p[2]; + + for (int k = 0; k < 6; ++k) { + J[k + 6] = -J[k]; + idx[k + 0] = 6 * i + k; + idx[k + 6] = 6 * j + k; + } + + for (int k = 0; k < 8; ++k) { + J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0]; + J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1]; + J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2]; + idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0; + idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1; + idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2; + } + + for (int k = 0; k < 8; ++k) { + J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; + J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; + J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; + idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0; + idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1; + idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2; + } + + for (int ki = 0; ki < 60; ++ki) { + for (int kj = 0; kj < 60; ++kj) { + float AtA_ij = J[ki] * J[kj]; + int ij = idx[ki] * n_vars + idx[kj]; + sycl::atomic_ref( + AtA_ptr[ij]) += AtA_ij; + } + sycl::atomic_ref( + Atb_ptr[idx[ki]]) += J[ki] * r; + } + sycl::atomic_ref(*residual_ptr) += + r * r; + }).wait_and_throw(); +} + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor 
&residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n_frags, + int anchor_idx) { + int64_t n = grid_idx.GetLength(); + int64_t n_vars = Atb.GetLength(); + + core::Device device = AtA.GetDevice(); + float *AtA_ptr = static_cast(AtA.GetDataPtr()); + float *Atb_ptr = static_cast(Atb.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const int *grid_idx_ptr = static_cast(grid_idx.GetDataPtr()); + const int *grid_nbs_idx_ptr = + static_cast(grid_nbs_idx.GetDataPtr()); + const bool *grid_nbs_mask_ptr = + static_cast(grid_nbs_mask.GetDataPtr()); + const float *positions_init_ptr = + static_cast(positions_init.GetDataPtr()); + const float *positions_curr_ptr = + static_cast(positions_curr.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int idx_i = grid_idx_ptr[workload_idx]; + + const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx; + const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx; + + float cov[3][3] = {{0}}; + float U[3][3], V[3][3], S[3]; + + int cnt = 0; + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + + for (int ii = 0; ii < 3; ++ii) { + for (int jj = 0; jj < 3; ++jj) { 
+ cov[ii][jj] += diff_ik_init[ii] * diff_ik_curr[jj]; + } + } + ++cnt; + } + + if (cnt < 3) return; + + core::linalg::kernel::svd3x3(*cov, *U, S, *V); + + float R[3][3]; + core::linalg::kernel::transpose3x3_(*U); + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + + float d = core::linalg::kernel::det3x3(*R); + if (d < 0) { + U[2][0] = -U[2][0]; + U[2][1] = -U[2][1]; + U[2][2] = -U[2][2]; + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + } + + if (idx_i == anchor_idx) { + R[0][0] = R[1][1] = R[2][2] = 1; + R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0; + } + + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + float R_diff_ik_curr[3]; + core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, + R_diff_ik_curr); + + float local_r[3]; + local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; + local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; + local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; + + int offset_idx_i = 3 * idx_i + 6 * n_frags; + int offset_idx_k = 3 * idx_k + 6 * n_frags; + + sycl::atomic_ref(*residual_ptr) += + weight * + (local_r[0] * local_r[0] + local_r[1] * local_r[1] + + local_r[2] * local_r[2]); + + for (int axis = 0; axis < 3; ++axis) { + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_i + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_k + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + 
offset_idx_k + axis]) -= weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_i + axis]) -= weight; + + sycl::atomic_ref( + Atb_ptr[offset_idx_i + axis]) += + weight * local_r[axis]; + sycl::atomic_ref( + Atb_ptr[offset_idx_k + axis]) -= + weight * local_r[axis]; + } + } + }).wait_and_throw(); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp index ac61daeaa2c..4a9afaff45f 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp @@ -58,6 +58,15 @@ void ComputeOdometryResultPointToPlane( target_vertex_map, target_normal_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultPointToPlaneSYCL( + source_vertex_map, target_vertex_map, target_normal_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, depth_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -118,6 +127,16 @@ void ComputeOdometryResultIntensity(const core::Tensor &source_depth, target_intensity_dx, target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultIntensitySYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_intensity_dx, target_intensity_dy, source_vertex_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { 
utility::LogError("Unimplemented device."); } @@ -187,6 +206,17 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultHybridSYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_depth_dx, target_depth_dy, target_intensity_dx, + target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, + delta, inlier_residual, inlier_count, depth_outlier_trunc, + depth_huber_delta, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -222,6 +252,14 @@ void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map, CUDA_CALL(ComputeOdometryInformationMatrixCUDA, source_vertex_map, target_vertex_map, intrinsic, source_to_target, square_dist_thr, information); + } else if (device.GetType() == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryInformationMatrixSYCL( + source_vertex_map, target_vertex_map, intrinsic, + source_to_target, square_dist_thr, information); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index d1aacbbeac3..3c65eec81e8 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -125,6 +125,61 @@ void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_depth, core::Tensor& information); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeOdometryResultPointToPlaneSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& 
target_normal_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta); + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta); + +void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta); + +void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information); +#endif + } // namespace odometry } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp new file mode 100644 index 00000000000..c27b3159609 --- /dev/null +++ 
b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -0,0 +1,398 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/geometry/kernel/GeometryIndexer.h" +#include "open3d/t/geometry/kernel/GeometryMacros.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryImpl.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { +namespace odometry { + +using t::geometry::kernel::NDArrayIndexer; +using t::geometry::kernel::TransformIndexer; + +static constexpr int kReduceDimOdometry = 29; +static constexpr int kJtJDimOdometry = 21; + +/// Perform a group reduction over a fixed-size private float array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template <int N>
+static inline void GroupReduceAndAdd(sycl::nd_item<1> item,
+                                     const float (&local_sum)[N],
+                                     float *global_sum_ptr) {
+    auto grp = item.get_group();
+    for (int k = 0; k < N; ++k) {
+        float grp_val =
+                sycl::reduce_over_group(grp, local_sum[k], sycl::plus<float>{});
+        if (item.get_local_id(0) == 0) {
+            sycl::atomic_ref<float, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device>(global_sum_ptr[k]) +=
+                    grp_val;
+        }
+    }
+}
+
+void ComputeOdometryResultPointToPlaneSYCL(
+        const core::Tensor &source_vertex_map,
+        const core::Tensor &target_vertex_map,
+        const core::Tensor &target_normal_map,
+        const core::Tensor &intrinsics,
+        const core::Tensor &init_source_to_target,
+        core::Tensor &delta,
+        float &inlier_residual,
+        int &inlier_count,
+        const float depth_outlier_trunc,
+        const float depth_huber_delta) {
+    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
+    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
+    NDArrayIndexer target_normal_indexer(target_normal_map, 2);
+
+    core::Device device = source_vertex_map.GetDevice();
+    core::Tensor trans = init_source_to_target;
+    TransformIndexer ti(intrinsics, trans);
+
+    const int64_t rows = source_vertex_indexer.GetShape(0);
+    const int64_t cols = source_vertex_indexer.GetShape(1);
+    const int64_t n = rows * cols;
+
+    core::Tensor global_sum =
+            core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device);
+    float *global_sum_ptr = global_sum.GetDataPtr<float>();
+
+    auto device_props =
+            core::sy::SYCLContext::GetInstance().GetDeviceProperties(device);
+    sycl::queue queue = device_props.queue;
+    const size_t wgs = device_props.max_work_group_size;
+    const size_t num_groups = ((size_t)n + wgs - 1) / wgs;
+
+    queue.submit([&](sycl::handler &cgh) {
+        cgh.parallel_for(
+                sycl::nd_range<1>{num_groups * wgs, wgs},
+                [=](sycl::nd_item<1> item) {
+                    const int gid = item.get_global_id(0);
+                    float local_sum[kReduceDimOdometry] = {};
+
+                    if (gid < n) {
+                        const int y = gid / cols;
+                        const int x = gid % cols;
+
+                        float J[6] = {0};
+                        float r = 0;
+                        const bool valid = 
GetJacobianPointToPlane( + x, y, depth_outlier_trunc, + source_vertex_indexer, + target_vertex_indexer, + target_normal_indexer, ti, J, r); + + if (valid) { + const float d_huber = + HuberDeriv(r, depth_huber_delta); + const float r_huber = + HuberLoss(r, depth_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += J[i] * J[j]; + } + local_sum[21 + i] += J[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + 
core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, r_I); + + if (valid) { + const float d_huber = + HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = + HuberLoss(r_I, intensity_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += J_I[i] * J_I[j]; + } + local_sum[21 + i] += J_I[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryResultHybridSYCL(const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_depth_dx, + const core::Tensor &target_depth_dy, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + 
core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_depth_dx_indexer(target_depth_dx, 2); + NDArrayIndexer target_depth_dy_indexer(target_depth_dy, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_depth_dx_indexer, + 
target_depth_dy_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, J_D, r_I, + r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_I[i] * J_I[j] + + J_D[i] * J_D[j]; + } + local_sum[21 + i] += + J_I[i] * + HuberDeriv( + r_I, + intensity_huber_delta) + + J_D[i] * + HuberDeriv( + r_D, + depth_huber_delta); + } + local_sum[27] += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryInformationMatrixSYCL(const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, + const core::Tensor &intrinsic, + const core::Tensor &source_to_target, + const float square_dist_thr, + core::Tensor &information) { + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = source_to_target; + TransformIndexer ti(intrinsic, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kJtJDimOdometry}, core::Float32, device); + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float 
local_sum[kJtJDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, + source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, + rx, ry, rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_x[i] * J_x[j] + + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; + } + } + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + information = core::Tensor::Empty({6, 6}, core::Float64, host); + global_sum = global_sum.To(host, core::Float64); + + double *info_ptr = information.GetDataPtr(); + double *reduction_ptr = global_sum.GetDataPtr(); + for (int j = 0; j < 6; j++) { + const int64_t reduction_idx = ((j * (j + 1)) / 2); + for (int k = 0; k <= j; k++) { + info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k]; + info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k]; + } + } +} + +} // namespace odometry +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/Registration.cpp b/cpp/open3d/t/pipelines/kernel/Registration.cpp index 326671431f3..95ccddec5c2 100644 --- a/cpp/open3d/t/pipelines/kernel/Registration.cpp +++ b/cpp/open3d/t/pipelines/kernel/Registration.cpp @@ -41,6 +41,16 @@ core::Tensor ComputePosePointToPlane(const core::Tensor &source_points, target_points.Contiguous(), target_normals.Contiguous(), correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePosePointToPlaneSYCL( + source_points.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), 
pose, residual, + inlier_count, source_points.GetDtype(), device, kernel); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -85,6 +95,18 @@ core::Tensor ComputePoseColoredICP(const core::Tensor &source_points, correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel, lambda_geometric); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePoseColoredICPSYCL( + source_points.Contiguous(), source_colors.Contiguous(), + target_points.Contiguous(), target_normals.Contiguous(), + target_colors.Contiguous(), target_color_gradients.Contiguous(), + correspondence_indices.Contiguous(), pose, residual, + inlier_count, source_points.GetDtype(), device, kernel, + lambda_geometric); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -191,6 +213,21 @@ core::Tensor ComputePoseDopplerICP( v_v_in_V.Contiguous(), period, reject_outliers, doppler_outlier_threshold, kernel_geometric, kernel_doppler, lambda_doppler); + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputePoseDopplerICPSYCL( + source_points.Contiguous(), source_dopplers.Contiguous(), + source_directions.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), output_pose, residual, + inlier_count, dtype, device, R_S_to_V.Contiguous(), + r_v_to_s_in_V.Contiguous(), w_v_in_V.Contiguous(), + v_v_in_V.Contiguous(), period, reject_outliers, + doppler_outlier_threshold, kernel_geometric, kernel_doppler, + lambda_doppler); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -270,6 +307,51 @@ std::tuple ComputeRtPointToPoint( t = mean_t.Reshape({-1}) - 
R.Matmul(mean_s.T()).Reshape({-1}); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + // Use tensor operations which work on SYCL devices. + core::Tensor valid = correspondence_indices.Ne(-1).Reshape({-1}); + + if (valid.GetLength() == 0) { + utility::LogError("No valid correspondence present."); + } + + core::Tensor source_indices = + core::Tensor::Arange(0, source_points.GetShape()[0], 1, + core::Int64, device) + .IndexGet({valid}); + core::Tensor target_indices = + correspondence_indices.IndexGet({valid}).Reshape({-1}); + + inlier_count = source_indices.GetLength(); + + core::Tensor source_select = source_points.IndexGet({source_indices}); + core::Tensor target_select = target_points.IndexGet({target_indices}); + + core::Tensor mean_s = source_select.Mean({0}, true); + core::Tensor mean_t = target_select.Mean({0}, true); + + core::Device host("CPU:0"); + core::Tensor Sxy = (target_select - mean_t) + .T() + .Matmul(source_select - mean_s) + .Div_(static_cast(inlier_count)) + .To(host, core::Float64); + + mean_s = mean_s.To(host, core::Float64); + mean_t = mean_t.To(host, core::Float64); + + core::Tensor U, D, VT; + std::tie(U, D, VT) = Sxy.SVD(); + core::Tensor S = core::Tensor::Eye(3, core::Float64, host); + if (U.Det() * (VT.T()).Det() < 0) { + S[-1][-1] = -1; + } + R = U.Matmul(S.Matmul(VT)); + t = mean_t.Reshape({-1}) - R.Matmul(mean_s.T()).Reshape({-1}); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -294,6 +376,14 @@ core::Tensor ComputeInformationMatrix( CUDA_CALL(ComputeInformationMatrixCUDA, target_points.Contiguous(), correspondence_indices.Contiguous(), information_matrix, target_points.GetDtype(), device); + } else if (target_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeInformationMatrixSYCL( + target_points.Contiguous(), 
correspondence_indices.Contiguous(), + information_matrix, target_points.GetDtype(), device); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index 3f08ca4c811..94acdf773b3 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -149,6 +149,63 @@ void ComputeInformationMatrixCUDA(const core::Tensor &target_points, const core::Device &device); #endif +#ifdef BUILD_SYCL_MODULE +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel); + +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric); + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const 
core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel &kernel_doppler, + const double lambda_doppler); + +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device); +#endif + template OPEN3D_HOST_DEVICE inline bool GetJacobianPointToPlane( int64_t workload_idx, diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp new file mode 100644 index 00000000000..c7503191fdb --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -0,0 +1,554 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/RegistrationImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" +#include "open3d/t/pipelines/registration/RobustKernel.h" +#include "open3d/t/pipelines/registration/RobustKernelImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +static constexpr int kReduceDim = + 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) + +/// Perform a group reduction over a fixed-size private array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template +static inline void GroupReduceAndAdd(sycl::nd_item<1> item, + const scalar_t (&local_sum)[N], + scalar_t *global_sum_ptr) { + auto grp = item.get_group(); + for (int k = 0; k < N; ++k) { + scalar_t grp_val = sycl::reduce_over_group(grp, local_sum[k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref(global_sum_ptr[k]) += + grp_val; + } + } +} + +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_ij[6] = {0}; + scalar_t r = 0; + const bool valid = GetJacobianPointToPlane< + scalar_t>( + gid, source_points_ptr, + 
target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); + + if (valid) { + const scalar_t w = + GetWeightFromRobustKernel( + r); + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_ij[j] * w * + J_ij[k]; + } + local_sum[21 + j] += + J_ij[j] * w * r; + } + local_sum[27] += r; + local_sum[28] += scalar_t(1); + } + } + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t sqrt_lambda_geometric = + static_cast(sqrt(lambda_geometric)); + const scalar_t sqrt_lambda_photometric = + static_cast(sqrt(1.0 - lambda_geometric)); + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *source_colors_ptr = + source_colors.GetDataPtr(); + const scalar_t 
*target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const scalar_t *target_colors_ptr = + target_colors.GetDataPtr(); + const scalar_t *target_color_gradients_ptr = + target_color_gradients.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_G[6] = {0}, + J_I[6] = {0}; + scalar_t r_G = 0, r_I = 0; + + const bool valid = GetJacobianColoredICP< + scalar_t>( + gid, source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, r_I); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernel( + r_G); + const scalar_t w_I = + GetWeightFromRobustKernel( + r_I); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_G[j] * w_G * + J_G[k] + + J_I[j] * w_I * + J_I[k]; + } + local_sum[21 + j] += + J_G[j] * w_G * + r_G + + J_I[j] * w_I * r_I; + } + local_sum[27] += + r_G * r_G + r_I * r_I; + local_sum[28] += scalar_t(1); + } + } + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + 
const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel &kernel_doppler, + const double lambda_doppler) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, + [&]() { + const scalar_t sqrt_lambda_geometric = + sqrt(1.0 - + static_cast( + lambda_doppler)); + const scalar_t sqrt_lambda_doppler = + sqrt(lambda_doppler); + const scalar_t + sqrt_lambda_doppler_by_dt = + sqrt_lambda_doppler / + static_cast( + period); + + // Pre-compute v_s_in_S using a + // single-task kernel. 
+ { + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *w_v_in_V_ptr = + w_v_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *v_v_in_V_ptr = + v_v_in_V.GetDataPtr< + scalar_t>(); + scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + queue.single_task([=]() { + PreComputeForDopplerICP( + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + w_v_in_V_ptr, + v_v_in_V_ptr, + v_s_in_S_ptr); + }).wait_and_throw(); + } + + scalar_t *global_sum_ptr = + global_sum.GetDataPtr< + scalar_t>(); + + DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(scalar_t, + kernel_geometric + .type_, + kernel_geometric + .scaling_parameter_, + kernel_doppler + .type_, + kernel_doppler + .scaling_parameter_, + [&]() { + const scalar_t *source_points_ptr = + source_points + .GetDataPtr< + scalar_t>(); + const scalar_t *source_dopplers_ptr = + source_dopplers + .GetDataPtr< + scalar_t>(); + const scalar_t *source_directions_ptr = + source_directions + .GetDataPtr< + scalar_t>(); + const scalar_t *target_points_ptr = + target_points + .GetDataPtr< + scalar_t>(); + const scalar_t *target_normals_ptr = + target_normals + .GetDataPtr< + scalar_t>(); + const int64_t *correspondence_indices_ptr = + correspondence_indices + .GetDataPtr< + int64_t>(); + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V + .GetDataPtr< + scalar_t>(); + const scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + + queue + .submit([&](sycl::handler + &cgh) { + cgh.parallel_for( + sycl::nd_range< + 1>{ + num_groups * + wgs, + wgs}, + [=](sycl::nd_item< + 1> item) { + const int gid = + item.get_global_id( + 0); + scalar_t local_sum + [kReduceDim] = + {}; + + if (gid < + n) { + scalar_t J_G + [6] = {0}, + J_D[6] = { + 0}; + scalar_t + r_G = 0, + r_D = 0; + + const bool valid = GetJacobianDopplerICP< + scalar_t>( + gid, + source_points_ptr, + 
source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast< + scalar_t>( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, + J_D, + r_G, + r_D); + + if (valid) { + const scalar_t + w_G = GetWeightFromRobustKernelFirst( + r_G); + const scalar_t + w_D = GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; + j < + 6; + ++j) { + for (int k = 0; + k <= + j; + ++k) { + local_sum + [i++] += + J_G[j] * + w_G * + J_G[k] + + J_D[j] * + w_D * + J_D[k]; + } + local_sum + [21 + + j] += + J_G[j] * + w_G * + r_G + + J_D[j] * + w_D * + r_D; + } + local_sum + [27] += + r_G * r_G + + r_D * r_D; + local_sum[28] += scalar_t( + 1); + } + } + + GroupReduceAndAdd< + kReduceDim, + scalar_t>( + item, + local_sum, + global_sum_ptr); + }); + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); +} + +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device) { + const int n = correspondence_indices.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + 
sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[21] = {}; + + if (gid < n) { + scalar_t J_x[6] = {0}, J_y[6] = {0}, + J_z[6] = {0}; + const bool valid = + GetInformationJacobians( + gid, target_points_ptr, + correspondence_indices_ptr, + J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + local_sum[i++] += J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + } + } + } + } + + GroupReduceAndAdd<21, scalar_t>(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + core::Tensor global_sum_cpu = global_sum.To(host, core::Float64); + double *sum_ptr = global_sum_cpu.GetDataPtr(); + + // Information matrix is on CPU of type Float64. + double *GTG_ptr = information_matrix.GetDataPtr(); + + int i = 0; + for (int j = 0; j < 6; j++) { + for (int k = 0; k <= j; k++) { + GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i]; + ++i; + } + } + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp index c181ade770e..5526e2f1f59 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp @@ -63,6 +63,13 @@ static void PoseToTransformationDevice( PoseToTransformationCUDA(transformation_ptr, pose_ptr); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + PoseToTransformationSYCL(transformation_ptr, pose_ptr, + transformation.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -109,6 +116,13 @@ static 
void TransformationToPoseDevice( TransformationToPoseCUDA(pose_ptr, transformation_ptr); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + TransformationToPoseSYCL(pose_ptr, transformation_ptr, + pose.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h index e75484ce3f5..6ee52ee8ab5 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h @@ -77,6 +77,26 @@ void TransformationToPoseCUDA(scalar_t *pose_ptr, const scalar_t *transformation_ptr); #endif +#ifdef BUILD_SYCL_MODULE +/// \brief Helper function for PoseToTransformationSYCL. +/// Do not call this independently, as it only sets the transformation part +/// in transformation matrix, using the Pose, the rest is set in +/// the parent function PoseToTransformation. +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device); + +/// \brief Helper function for TransformationToPoseSYCL. +/// Do not call this independently, as it only sets the rotation part in the +/// pose, using the Transformation, the rest is set in the parent function +/// TransformationToPose. 
+template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp new file mode 100644 index 00000000000..13eb4490465 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp @@ -0,0 +1,56 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Device.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/t/pipelines/kernel/TransformationConverterImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + PoseToTransformationImpl(transformation_ptr, pose_ptr); + }).wait_and_throw(); +} + +template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + TransformationToPoseImpl(pose_ptr, transformation_ptr); + }).wait_and_throw(); +} + +template void PoseToTransformationSYCL(float *, + const float *, + const core::Device &); +template void PoseToTransformationSYCL(double *, + const double *, + const core::Device &); +template void TransformationToPoseSYCL(float *, + 
const float *, + const core::Device &); +template void TransformationToPoseSYCL(double *, + const double *, + const core::Device &); + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/pybind/core/nns/nearest_neighbor_search.cpp b/cpp/pybind/core/nns/nearest_neighbor_search.cpp index 3bb647592a0..99394d8681a 100644 --- a/cpp/pybind/core/nns/nearest_neighbor_search.cpp +++ b/cpp/pybind/core/nns/nearest_neighbor_search.cpp @@ -84,7 +84,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for fixed-radius search. Required - for GPU fixed radius index. + for CUDA and SYCL fixed radius indices. Returns: True on success. @@ -115,7 +115,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for hybrid search. Required - for GPU hybrid index. + for CUDA and SYCL hybrid indices. Returns: True on success. @@ -130,7 +130,8 @@ This function needs to be called once before performing search operations. To use knn_search initialize the index using knn_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. knn (int): Number of neighbors to search per query point. Example: @@ -179,6 +180,7 @@ This function needs to be called once before performing search operations. Args: query_points (open3d.core.Tensor): Query points with shape {n, d}. + CPU, CUDA, and SYCL devices are supported. radius (float): Radius value for fixed-radius search. Note that this parameter can differ from the radius used to initialize the index for convenience, which may cause the index to be rebuilt for GPU @@ -234,7 +236,8 @@ This function needs to be called once before performing search operations. 
To use multi_radius_search initialize the index using multi_radius_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. Only + CPU tensors are supported. radii (open3d.core.Tensor): Radii of query points. Each query point has one radius. Returns: @@ -292,7 +295,8 @@ Hybrid search behaves similarly to fixed-radius search, but with a maximum numbe To use hybrid_search initialize the index using hybrid_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. radius (float): Radius value for hybrid search. max_knn (int): Maximum number of neighbor to search per query. diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index a1c4826d0fa..1d71b53455c 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -22,10 +22,15 @@ target_sources(tests PRIVATE SYCLUtils.cpp ) +if (BUILD_CUDA_MODULE OR BUILD_SYCL_MODULE) + target_sources(tests PRIVATE + KnnIndex.cpp + ) +endif() + if (BUILD_CUDA_MODULE) target_sources(tests PRIVATE FixedRadiusIndex.cpp - KnnIndex.cpp ParallelFor.cu ) endif() @@ -38,10 +43,8 @@ if (BUILD_ISPC_MODULE) endif() # TODO: cmake does not currently build this test! 
-# if (BUILD_SYCL_MODULE) -# target_sources(tests PRIVATE -# ParallelForSYCL.cpp -# ) -# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES -# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") -# endif() \ No newline at end of file +#if (BUILD_SYCL_MODULE) +# open3d_sycl_target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +#endif() \ No newline at end of file diff --git a/cpp/tests/core/KnnIndex.cpp b/cpp/tests/core/KnnIndex.cpp index 1ceb32d4835..a5d63beebd1 100644 --- a/cpp/tests/core/KnnIndex.cpp +++ b/cpp/tests/core/KnnIndex.cpp @@ -24,9 +24,18 @@ using namespace std; namespace open3d { namespace tests { -TEST(KnnIndex, KnnSearch) { +class KnnIndexPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + KnnIndex, + KnnIndexPermuteDevices, + testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); + +TEST_P(KnnIndexPermuteDevices, KnnSearch) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -149,9 +158,12 @@ TEST(KnnIndex, KnnSearch) { EXPECT_TRUE(distances.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchHighdim) { +TEST_P(KnnIndexPermuteDevices, KnnSearchHighdim) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -231,9 +243,12 @@ TEST(KnnIndex, KnnSearchHighdim) { EXPECT_TRUE(distances64.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchBatch) { +TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { // Define test data. 
- core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init( {{0.719, 0.128, 0.431}, {0.764, 0.970, 0.678}, {0.692, 0.786, 0.211}, {0.692, 0.969, 0.942}, @@ -303,8 +318,9 @@ TEST(KnnIndex, KnnSearchBatch) { index32.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); // int64 // Set up Knn index. @@ -327,8 +343,9 @@ TEST(KnnIndex, KnnSearchBatch) { index64.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); } } // namespace tests diff --git a/cpp/tests/core/NearestNeighborSearch.cpp b/cpp/tests/core/NearestNeighborSearch.cpp index d282e8ecd9a..547d3829a06 100644 --- a/cpp/tests/core/NearestNeighborSearch.cpp +++ b/cpp/tests/core/NearestNeighborSearch.cpp @@ -22,7 +22,7 @@ namespace open3d { namespace tests { -class NNSPermuteDevices : public PermuteDevices {}; +class NNSPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(NearestNeighborSearch, NNSPermuteDevices, testing::ValuesIn(PermuteDevices::TestCases())); diff --git a/cpp/tests/t/pipelines/TransformationConverter.cpp b/cpp/tests/t/pipelines/TransformationConverter.cpp index 5d8bd0d8434..a4c092c62d6 100644 --- a/cpp/tests/t/pipelines/TransformationConverter.cpp +++ 
b/cpp/tests/t/pipelines/TransformationConverter.cpp @@ -14,10 +14,11 @@ namespace open3d { namespace tests { -class TransformationConverterPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TransformationConverter, - TransformationConverterPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TransformationConverterPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TransformationConverter, + TransformationConverterPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TransformationConverterPermuteDevices, RtToTransformation) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp index 3bb7ee36142..64d2a97d4a7 100644 --- a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp +++ b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp @@ -21,10 +21,11 @@ namespace open3d { namespace tests { -class OdometryPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Odometry, - OdometryPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class OdometryPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Odometry, + OdometryPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); core::Tensor CreateIntrisicTensor() { camera::PinholeCameraIntrinsic intrinsic = camera::PinholeCameraIntrinsic( diff --git a/cpp/tests/t/pipelines/registration/Feature.cpp b/cpp/tests/t/pipelines/registration/Feature.cpp index a3741a3ed8a..7cb5f27ea8c 100644 --- a/cpp/tests/t/pipelines/registration/Feature.cpp +++ b/cpp/tests/t/pipelines/registration/Feature.cpp @@ -20,10 +20,11 @@ namespace open3d { namespace tests { -class FeaturePermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Feature, - FeaturePermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class FeaturePermuteDevices : public PermuteDevicesWithSYCL {}; 
+INSTANTIATE_TEST_SUITE_P( + Feature, + FeaturePermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(FeaturePermuteDevices, SelectByIndex) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/registration/Registration.cpp b/cpp/tests/t/pipelines/registration/Registration.cpp index d823394c6bf..d3d0544e27c 100644 --- a/cpp/tests/t/pipelines/registration/Registration.cpp +++ b/cpp/tests/t/pipelines/registration/Registration.cpp @@ -26,10 +26,11 @@ namespace l_reg = open3d::pipelines::registration; namespace open3d { namespace tests { -class RegistrationPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Registration, - RegistrationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class RegistrationPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Registration, + RegistrationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(RegistrationPermuteDevices, ICPConvergenceCriteriaConstructor) { // Constructor. 
diff --git a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp index 2daca846d0a..afc7f40e7a8 100644 --- a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp +++ b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TransformationEstimationPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TransformationEstimation, - TransformationEstimationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TransformationEstimationPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TransformationEstimation, + TransformationEstimationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); static std:: tuple diff --git a/cpp/tests/t/pipelines/slac/ControlGrid.cpp b/cpp/tests/t/pipelines/slac/ControlGrid.cpp index 98bce9d5ef2..7f716f553ee 100644 --- a/cpp/tests/t/pipelines/slac/ControlGrid.cpp +++ b/cpp/tests/t/pipelines/slac/ControlGrid.cpp @@ -24,10 +24,11 @@ static t::geometry::PointCloud CreateTPCDFromFile( return t::geometry::PointCloud::FromLegacy(*pcd, core::Float32, device); } -class ControlGridPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(ControlGrid, - ControlGridPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class ControlGridPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + ControlGrid, + ControlGridPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // TODO(wei): more well-designed test cases TEST_P(ControlGridPermuteDevices, Touch) { diff --git a/cpp/tests/t/pipelines/slac/SLAC.cpp b/cpp/tests/t/pipelines/slac/SLAC.cpp index bc31cb31044..00926a38741 100644 --- a/cpp/tests/t/pipelines/slac/SLAC.cpp +++ b/cpp/tests/t/pipelines/slac/SLAC.cpp @@ -27,10 +27,11 @@ namespace open3d { namespace tests { -class SLACPermuteDevices : 
public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(SLAC, - SLACPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class SLACPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + SLAC, + SLACPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // PointCloud is similar if fitness is higher and rmse is lower than tolerance // threshold. diff --git a/python/test/core/test_nn.py b/python/test/core/test_nn.py index 48b7d2682cd..b5376fec105 100644 --- a/python/test/core/test_nn.py +++ b/python/test/core/test_nn.py @@ -19,7 +19,7 @@ np.random.seed(0) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_index(device): dtype = o3c.float32 @@ -34,7 +34,7 @@ def test_knn_index(device): assert nns.multi_radius_index() -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_search(device): dtype = o3c.float32 @@ -107,7 +107,7 @@ def test_knn_search(device): rtol=1e-5) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_fixed_radius_search(device, dtype): dataset_points = o3c.Tensor(