From f965404b4f0f0b0b3535c141e02ff78c4117b14b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 07:49:39 +0000 Subject: [PATCH 01/14] Initial plan From a5f1fa3dc756e1ab2b15234c0a5beb7caa5428b7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:01:06 +0000 Subject: [PATCH 02/14] Add SYCL kernels for t/pipelines/kernel: Feature, FillInLinearSystem, Registration, RGBDOdometry, TransformationConverter Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/pipelines/kernel/CMakeLists.txt | 10 + cpp/open3d/t/pipelines/kernel/Feature.cpp | 11 +- cpp/open3d/t/pipelines/kernel/Feature.h | 13 + cpp/open3d/t/pipelines/kernel/FeatureImpl.h | 2 + cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp | 228 +++++++++ .../t/pipelines/kernel/FillInLinearSystem.cpp | 26 + .../t/pipelines/kernel/FillInLinearSystem.h | 41 ++ .../pipelines/kernel/FillInLinearSystemImpl.h | 2 + .../kernel/FillInLinearSystemSYCL.cpp | 456 +++++++++++++++++ .../t/pipelines/kernel/RGBDOdometry.cpp | 39 ++ .../t/pipelines/kernel/RGBDOdometryImpl.h | 55 +++ .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 366 ++++++++++++++ .../t/pipelines/kernel/Registration.cpp | 91 ++++ .../t/pipelines/kernel/RegistrationImpl.h | 57 +++ .../t/pipelines/kernel/RegistrationSYCL.cpp | 462 ++++++++++++++++++ .../kernel/TransformationConverter.cpp | 14 + .../kernel/TransformationConverterImpl.h | 20 + .../kernel/TransformationConverterSYCL.cpp | 56 +++ 18 files changed, 1948 insertions(+), 1 deletion(-) create mode 100644 cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp 
diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index a766715e6e7..2a9478b4be9 100644 --- a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -22,6 +22,16 @@ if (BUILD_CUDA_MODULE) ) endif() +if (BUILD_SYCL_MODULE) + target_sources(tpipelines_kernel PRIVATE + RegistrationSYCL.cpp + FillInLinearSystemSYCL.cpp + RGBDOdometrySYCL.cpp + TransformationConverterSYCL.cpp + FeatureSYCL.cpp + ) +endif() + open3d_show_and_abort_on_warning(tpipelines_kernel) open3d_set_global_properties(tpipelines_kernel) # The kernels are used in the unit tests, so they cannot be hidden for now. diff --git a/cpp/open3d/t/pipelines/kernel/Feature.cpp b/cpp/open3d/t/pipelines/kernel/Feature.cpp index c73573054aa..4677396f1d7 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.cpp +++ b/cpp/open3d/t/pipelines/kernel/Feature.cpp @@ -44,10 +44,19 @@ void ComputeFPFHFeature( if (points_d.IsCPU()) { ComputeFPFHFeatureCPU(points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); - } else { + } else if (points_d.IsCUDA()) { core::CUDAScopedDevice scoped_device(points.GetDevice()); CUDA_CALL(ComputeFPFHFeatureCUDA, points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); + } else if (points_d.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeFPFHFeatureSYCL(points_d, normals_d, indices, distance2, + counts_d, fpfhs, mask, map_info_idx_to_point_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif + } else { + utility::LogError("Unimplemented device."); } utility::LogDebug( "[ComputeFPFHFeature] Computed {:d} features from " diff --git a/cpp/open3d/t/pipelines/kernel/Feature.h b/cpp/open3d/t/pipelines/kernel/Feature.h index 6645c9bb6cc..cc1d66f6920 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.h +++ b/cpp/open3d/t/pipelines/kernel/Feature.h @@ -49,6 +49,19 @@ void ComputeFPFHFeatureCUDA( 
std::nullopt); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional<core::Tensor> &mask = std::nullopt, + const std::optional<core::Tensor> &map_batch_info_idx_to_point_idx = + std::nullopt); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h index c9948b6a62d..5caef81ff49 100644 --- a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h @@ -104,6 +104,7 @@ OPEN3D_HOST_DEVICE void UpdateSPFHFeature(const scalar_t *feature, spfh[idx * 33 + h_index3 + 22] += hist_incr; } +#ifndef OPEN3D_SKIP_FPFH_MAIN #if defined(__CUDACC__) void ComputeFPFHFeatureCUDA #else @@ -291,6 +292,7 @@ void ComputeFPFHFeatureCPU }); }); } +#endif // OPEN3D_SKIP_FPFH_MAIN } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp new file mode 100644 index 00000000000..7338edc63f8 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp @@ -0,0 +1,228 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates.
+#define OPEN3D_SKIP_FPFH_MAIN +#include "open3d/t/pipelines/kernel/FeatureImpl.h" +#undef OPEN3D_SKIP_FPFH_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional<core::Tensor> &mask, + const std::optional<core::Tensor> &map_info_idx_to_point_idx) { + const core::Dtype dtype = points.GetDtype(); + const core::Device device = points.GetDevice(); + const int64_t n_points = points.GetLength(); + + const bool filter_fpfh = + mask.has_value() && map_info_idx_to_point_idx.has_value(); + if (mask.has_value() ^ map_info_idx_to_point_idx.has_value()) { + utility::LogError( + "Parameters mask and map_info_idx_to_point_idx must " + "either be both provided or both not provided."); + } + if (filter_fpfh) { + if (mask.value().GetShape()[0] != n_points) { + utility::LogError( + "Parameter mask was provided, but its size {:d} should" + "be equal to the number of points {:d}.", + (int)mask.value().GetShape()[0], n_points); + } + if (map_info_idx_to_point_idx.value().GetShape()[0] != + counts.GetShape()[0] - (indices.GetShape().size() == 1 ? 1 : 0)) { + utility::LogError( + "Parameter map_info_idx_to_point_idx was provided, " + "but its size" + "{:d} should be equal to the size of counts {:d}.", + (int)map_info_idx_to_point_idx.value().GetShape()[0], + (int)counts.GetShape()[0]); + } + } + + core::Tensor map_spfh_info_idx_to_point_idx = + map_info_idx_to_point_idx.value_or( + core::Tensor::Empty({0}, core::Int64, device)); + + const core::Tensor map_fpfh_idx_to_point_idx = + filter_fpfh ? mask.value().NonZero().GetItem( + {core::TensorKey::Index(0)}) + : core::Tensor::Empty({0}, core::Int64, device); + + const int32_t n_fpfh = + filter_fpfh ?
map_fpfh_idx_to_point_idx.GetLength() : n_points; + const int32_t n_spfh = + filter_fpfh ? map_spfh_info_idx_to_point_idx.GetLength() : n_points; + + core::Tensor spfhs = + core::Tensor::Zeros({n_spfh, 33}, dtype, fpfhs.GetDevice()); + + core::Tensor map_point_idx_to_spfh_idx; + if (filter_fpfh) { + map_point_idx_to_spfh_idx = core::Tensor::Full( + {n_points}, -1, core::Int64, fpfhs.GetDevice()); + map_point_idx_to_spfh_idx.IndexSet( + {map_spfh_info_idx_to_point_idx}, + core::Tensor::Arange(0, n_spfh, 1, core::Int64, + fpfhs.GetDevice())); + } else { + map_point_idx_to_spfh_idx = + core::Tensor::Empty({0}, core::Int64, fpfhs.GetDevice()); + } + + // Check the nns type (knn = hybrid = false, radius = true). + // The nns radius search mode will resulting a prefix sum 1D tensor. + bool is_radius_search; + int nn_size = 0; + if (indices.GetShape().size() == 1) { + is_radius_search = true; + } else { + is_radius_search = false; + nn_size = indices.GetShape()[1]; + } + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t *points_ptr = points.GetDataPtr<scalar_t>(); + const scalar_t *normals_ptr = normals.GetDataPtr<scalar_t>(); + const int32_t *indices_ptr = indices.GetDataPtr<int32_t>(); + const scalar_t *distance2_ptr = distance2.GetDataPtr<scalar_t>(); + const int32_t *counts_ptr = counts.GetDataPtr<int32_t>(); + scalar_t *spfhs_ptr = spfhs.GetDataPtr<scalar_t>(); + scalar_t *fpfhs_ptr = fpfhs.GetDataPtr<scalar_t>(); + const int64_t *map_spfh_info_idx_to_point_idx_ptr = + map_spfh_info_idx_to_point_idx.GetDataPtr<int64_t>(); + const int64_t *map_fpfh_idx_to_point_idx_ptr = + map_fpfh_idx_to_point_idx.GetDataPtr<int64_t>(); + const int64_t *map_point_idx_to_spfh_idx_ptr = + map_point_idx_to_spfh_idx.GetDataPtr<int64_t>(); + + // Compute SPFH features for the points. + queue.parallel_for( + sycl::range<1>{(size_t)n_spfh}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_point_idx = + filter_fpfh + ?
map_spfh_info_idx_to_point_idx_ptr + [workload_idx] + : workload_idx; + int64_t idx = 3 * workload_point_idx; + const scalar_t *point = points_ptr + idx; + const scalar_t *normal = normals_ptr + idx; + + const int indice_size = + is_radius_search + ? (counts_ptr[workload_idx + 1] - + counts_ptr[workload_idx]) + : counts_ptr[workload_idx]; + + if (indice_size > 1) { + const scalar_t hist_incr = + 100.0 / static_cast( + indice_size - 1); + for (int i = 1; i < indice_size; i++) { + const int point_idx = + is_radius_search + ? indices_ptr + [i + + counts_ptr + [workload_idx]] + : indices_ptr[workload_idx * + nn_size + + i]; + + const scalar_t *point_ref = + points_ptr + 3 * point_idx; + const scalar_t *normal_ref = + normals_ptr + 3 * point_idx; + scalar_t fea[4] = {0}; + ComputePairFeature( + point, normal, point_ref, normal_ref, + fea); + UpdateSPFHFeature( + fea, workload_idx, hist_incr, + spfhs_ptr); + } + } + }) + .wait_and_throw(); + + // Compute FPFH features for the points. + queue.parallel_for( + sycl::range<1>{(size_t)n_fpfh}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_spfh_idx = + filter_fpfh + ? map_point_idx_to_spfh_idx_ptr + [map_fpfh_idx_to_point_idx_ptr + [workload_idx]] + : workload_idx; + const int indice_size = + is_radius_search + ? (counts_ptr[workload_spfh_idx + 1] - + counts_ptr[workload_spfh_idx]) + : counts_ptr[workload_spfh_idx]; + if (indice_size > 1) { + scalar_t sum[3] = {0.0, 0.0, 0.0}; + for (int i = 1; i < indice_size; i++) { + const int idx = + is_radius_search + ? i + + counts_ptr + [workload_spfh_idx] + : workload_spfh_idx * nn_size + + i; + const scalar_t dist = distance2_ptr[idx]; + if (dist == 0.0) continue; + const int32_t spfh_idx = + filter_fpfh + ? 
map_point_idx_to_spfh_idx_ptr + [indices_ptr[idx]] + : indices_ptr[idx]; + for (int j = 0; j < 33; j++) { + const scalar_t val = + spfhs_ptr[spfh_idx * 33 + j] / + dist; + sum[j / 11] += val; + fpfhs_ptr[workload_idx * 33 + j] += val; + } + } + for (int j = 0; j < 3; j++) { + sum[j] = sum[j] != 0.0 ? 100.0 / sum[j] : 0.0; + } + for (int j = 0; j < 33; j++) { + fpfhs_ptr[workload_idx * 33 + j] *= + sum[j / 11]; + fpfhs_ptr[workload_idx * 33 + j] += + spfhs_ptr[workload_spfh_idx * 33 + j]; + } + } + }) + .wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp index 9d5e825da9a..82a2c33ac93 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp @@ -60,6 +60,13 @@ void FillInRigidAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInRigidAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, + Ri_normal_ps, i, j, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -124,6 +131,16 @@ void FillInSLACAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, + normal_ps, Ri_normal_ps, + RjT_Ri_normal_ps, cgrid_idx_ps, + cgrid_idx_qs, cgrid_ratio_ps, + cgrid_ratio_qs, i, j, n, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -163,6 +180,15 @@ void FillInSLACRegularizerTerm(core::Tensor &AtA, positions_init, positions_curr, 
weight, n, anchor_idx); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACRegularizerTermSYCL(AtA, Atb, residual, grid_idx, + grid_nbs_idx, grid_nbs_mask, + positions_init, positions_curr, weight, n, + anchor_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h index 1c529813841..91014b84dd2 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h @@ -130,6 +130,47 @@ void FillInSLACRegularizerTermCUDA(core::Tensor &AtA, int n, int anchor_idx); +#endif +#ifdef BUILD_SYCL_MODULE +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold); + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &normal_ps, + const core::Tensor &Ri_normal_ps, + const core::Tensor &RjT_Ri_normal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n, + float threshold); + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n, + int anchor_idx); + #endif } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h 
b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h index e158809f6a4..21732102836 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h @@ -13,6 +13,7 @@ namespace open3d { namespace t { namespace pipelines { namespace kernel { +#ifndef OPEN3D_SKIP_FILL_IN_LS_MAIN #if defined(__CUDACC__) void FillInRigidAlignmentTermCUDA #else @@ -471,6 +472,7 @@ void FillInSLACRegularizerTermCPU } }); } +#endif // OPEN3D_SKIP_FILL_IN_LS_MAIN } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp new file mode 100644 index 00000000000..e99b7f5ccb6 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -0,0 +1,456 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; include only helper includes. 
+#define OPEN3D_SKIP_FILL_IN_LS_MAIN +#include "open3d/t/pipelines/kernel/FillInLinearSystemImpl.h" +#undef OPEN3D_SKIP_FILL_IN_LS_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/FillInLinearSystem.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_ps, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold) { + core::Device device = AtA.GetDevice(); + int64_t n = Ti_ps.GetLength(); + if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) { + utility::LogError( + "Unable to setup linear system: input length mismatch."); + } + + // First fill in a small 12 x 12 linear system. + core::Tensor AtA_local = + core::Tensor::Zeros({12, 12}, core::Float32, device); + core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device); + + float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr()); + float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr()); + float *residual_ptr = static_cast<float *>(residual.GetDataPtr()); + + const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr()); + const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr()); + const float *Ri_normal_ps_ptr = + static_cast<const float *>(Ri_normal_ps.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *p_prime = Ti_ps_ptr + 3 * workload_idx; + const float *q_prime = Tj_qs_ptr + 3 * workload_idx; + const float *normal_p_prime = + Ri_normal_ps_ptr + 3 * workload_idx; + + float r = (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * normal_p_prime[2]; + if (sycl::fabs(r)
> threshold) return; + + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; ++i_local) { + for (int j_local = 0; j_local < 12; ++j_local) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_local_ptr[i_local * 12 + j_local]) += + J_ij[i_local] * J_ij[j_local]; + } + sycl::atomic_ref( + Atb_local_ptr[i_local]) += + J_ij[i_local] * r; + } + sycl::atomic_ref( + *residual_ptr) += r * r; + }) + .wait_and_throw(); + + // Then fill-in the large linear system. + std::vector indices_vec(12); + for (int k = 0; k < 6; ++k) { + indices_vec[k] = i * 6 + k; + indices_vec[k + 6] = j * 6 + k; + } + + std::vector indices_i_vec; + std::vector indices_j_vec; + for (int local_i = 0; local_i < 12; ++local_i) { + for (int local_j = 0; local_j < 12; ++local_j) { + indices_i_vec.push_back(indices_vec[local_i]); + indices_j_vec.push_back(indices_vec[local_j]); + } + } + + core::Tensor indices(indices_vec, {12}, core::Int64, device); + core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device); + core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device); + + core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j}); + AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12})); + + core::Tensor Atb_sub = Atb.IndexGet({indices}); + Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1})); +} + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_Cps, + const core::Tensor &Tj_Cqs, + const core::Tensor &Cnormal_ps, + const core::Tensor 
&Ri_Cnormal_ps, + const core::Tensor &RjT_Ri_Cnormal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n_frags, + float threshold) { + int64_t n = Ti_Cps.GetLength(); + if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n || + Ri_Cnormal_ps.GetLength() != n || + RjT_Ri_Cnormal_ps.GetLength() != n || + cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n || + cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) { + utility::LogError( + "Unable to setup linear system: input length mismatch."); + } + + core::Device device = AtA.GetDevice(); + int n_vars = Atb.GetLength(); + float *AtA_ptr = static_cast(AtA.GetDataPtr()); + float *Atb_ptr = static_cast(Atb.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const float *Ti_Cps_ptr = + static_cast(Ti_Cps.GetDataPtr()); + const float *Tj_Cqs_ptr = + static_cast(Tj_Cqs.GetDataPtr()); + const float *Cnormal_ps_ptr = + static_cast(Cnormal_ps.GetDataPtr()); + const float *Ri_Cnormal_ps_ptr = + static_cast(Ri_Cnormal_ps.GetDataPtr()); + const float *RjT_Ri_Cnormal_ps_ptr = + static_cast(RjT_Ri_Cnormal_ps.GetDataPtr()); + + const int *cgrid_idx_ps_ptr = + static_cast(cgrid_idx_ps.GetDataPtr()); + const int *cgrid_idx_qs_ptr = + static_cast(cgrid_idx_qs.GetDataPtr()); + const float *cgrid_ratio_ps_ptr = + static_cast(cgrid_ratio_ps.GetDataPtr()); + const float *cgrid_ratio_qs_ptr = + static_cast(cgrid_ratio_qs.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx; + const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx; + const float *Cnormal_p = + Cnormal_ps_ptr + 3 * workload_idx; + const float *Ri_Cnormal_p = + Ri_Cnormal_ps_ptr + 3 * 
workload_idx; + const float *RjTRi_Cnormal_p = + RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx; + + const int *cgrid_idx_p = + cgrid_idx_ps_ptr + 8 * workload_idx; + const int *cgrid_idx_q = + cgrid_idx_qs_ptr + 8 * workload_idx; + const float *cgrid_ratio_p = + cgrid_ratio_ps_ptr + 8 * workload_idx; + const float *cgrid_ratio_q = + cgrid_ratio_qs_ptr + 8 * workload_idx; + + float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + + (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + + (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; + if (sycl::fabs(r) > threshold) return; + + float J[60]; + int idx[60]; + + J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + + Tj_Cq[1] * Ri_Cnormal_p[2]; + J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - + Tj_Cq[0] * Ri_Cnormal_p[2]; + J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + + Tj_Cq[0] * Ri_Cnormal_p[1]; + J[3] = Ri_Cnormal_p[0]; + J[4] = Ri_Cnormal_p[1]; + J[5] = Ri_Cnormal_p[2]; + + for (int k = 0; k < 6; ++k) { + J[k + 6] = -J[k]; + idx[k + 0] = 6 * i + k; + idx[k + 6] = 6 * j + k; + } + + for (int k = 0; k < 8; ++k) { + J[12 + k * 3 + 0] = + cgrid_ratio_p[k] * Cnormal_p[0]; + J[12 + k * 3 + 1] = + cgrid_ratio_p[k] * Cnormal_p[1]; + J[12 + k * 3 + 2] = + cgrid_ratio_p[k] * Cnormal_p[2]; + idx[12 + k * 3 + 0] = + 6 * n_frags + cgrid_idx_p[k] * 3 + 0; + idx[12 + k * 3 + 1] = + 6 * n_frags + cgrid_idx_p[k] * 3 + 1; + idx[12 + k * 3 + 2] = + 6 * n_frags + cgrid_idx_p[k] * 3 + 2; + } + + for (int k = 0; k < 8; ++k) { + J[36 + k * 3 + 0] = + -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; + J[36 + k * 3 + 1] = + -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; + J[36 + k * 3 + 2] = + -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; + idx[36 + k * 3 + 0] = + 6 * n_frags + cgrid_idx_q[k] * 3 + 0; + idx[36 + k * 3 + 1] = + 6 * n_frags + cgrid_idx_q[k] * 3 + 1; + idx[36 + k * 3 + 2] = + 6 * n_frags + cgrid_idx_q[k] * 3 + 2; + } + + for (int ki = 0; ki < 60; ++ki) { + for (int kj = 0; kj < 60; ++kj) { + float AtA_ij = J[ki] * J[kj]; + int ij = idx[ki] * n_vars + idx[kj]; + sycl::atomic_ref< + float, 
sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[ij]) += AtA_ij; + } + sycl::atomic_ref( + Atb_ptr[idx[ki]]) += J[ki] * r; + } + sycl::atomic_ref( + *residual_ptr) += r * r; + }) + .wait_and_throw(); +} + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n_frags, + int anchor_idx) { + int64_t n = grid_idx.GetLength(); + int64_t n_vars = Atb.GetLength(); + + core::Device device = AtA.GetDevice(); + float *AtA_ptr = static_cast(AtA.GetDataPtr()); + float *Atb_ptr = static_cast(Atb.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const int *grid_idx_ptr = + static_cast(grid_idx.GetDataPtr()); + const int *grid_nbs_idx_ptr = + static_cast(grid_nbs_idx.GetDataPtr()); + const bool *grid_nbs_mask_ptr = + static_cast(grid_nbs_mask.GetDataPtr()); + const float *positions_init_ptr = + static_cast(positions_init.GetDataPtr()); + const float *positions_curr_ptr = + static_cast(positions_curr.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int idx_i = grid_idx_ptr[workload_idx]; + + const int *idx_nbs = + grid_nbs_idx_ptr + 6 * workload_idx; + const bool *mask_nbs = + grid_nbs_mask_ptr + 6 * workload_idx; + + float cov[3][3] = {{0}}; + float U[3][3], V[3][3], S[3]; + + int cnt = 0; + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 
2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + + for (int ii = 0; ii < 3; ++ii) { + for (int jj = 0; jj < 3; ++jj) { + cov[ii][jj] += diff_ik_init[ii] * + diff_ik_curr[jj]; + } + } + ++cnt; + } + + if (cnt < 3) return; + + core::linalg::kernel::svd3x3(*cov, *U, S, *V); + + float R[3][3]; + core::linalg::kernel::transpose3x3_(*U); + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + + float d = core::linalg::kernel::det3x3(*R); + if (d < 0) { + U[2][0] = -U[2][0]; + U[2][1] = -U[2][1]; + U[2][2] = -U[2][2]; + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + } + + if (idx_i == anchor_idx) { + R[0][0] = R[1][1] = R[2][2] = 1; + R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = + R[2][1] = 0; + } + + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + float R_diff_ik_curr[3]; + core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, + R_diff_ik_curr); + + float local_r[3]; + local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; + local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; + local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; + + int offset_idx_i = 3 * idx_i + 6 * n_frags; + int offset_idx_k = 3 * idx_k + 6 * n_frags; + + sycl::atomic_ref( + *residual_ptr) += + weight * (local_r[0] * local_r[0] + + local_r[1] * local_r[1] + 
+ local_r[2] * local_r[2]); + + for (int axis = 0; axis < 3; ++axis) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_i + axis]) += weight; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_k + axis]) += weight; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_k + axis]) -= weight; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_i + axis]) -= weight; + + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + Atb_ptr[offset_idx_i + axis]) += + weight * local_r[axis]; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + Atb_ptr[offset_idx_k + axis]) -= + weight * local_r[axis]; + } + } + }) + .wait_and_throw(); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp index ac61daeaa2c..6b54fb41961 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp @@ -58,6 +58,15 @@ void ComputeOdometryResultPointToPlane( target_vertex_map, target_normal_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultPointToPlaneSYCL( + source_vertex_map, target_vertex_map, target_normal_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, depth_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { 
utility::LogError("Unimplemented device."); } @@ -118,6 +127,16 @@ void ComputeOdometryResultIntensity(const core::Tensor &source_depth, target_intensity_dx, target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultIntensitySYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_intensity_dx, target_intensity_dy, source_vertex_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -187,6 +206,17 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultHybridSYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_depth_dx, target_depth_dy, target_intensity_dx, + target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, + delta, inlier_residual, inlier_count, depth_outlier_trunc, + depth_huber_delta, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -222,6 +252,15 @@ void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map, CUDA_CALL(ComputeOdometryInformationMatrixCUDA, source_vertex_map, target_vertex_map, intrinsic, source_to_target, square_dist_thr, information); + } else if (device.GetType() == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryInformationMatrixSYCL(source_vertex_map, + target_vertex_map, intrinsic, + source_to_target, 
square_dist_thr, + information); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index d1aacbbeac3..ff854a406e2 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -125,6 +125,61 @@ void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_depth, core::Tensor& information); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeOdometryResultPointToPlaneSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& source_normal_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta); + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta); + +void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, 
+ core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta); + +void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information); +#endif + } // namespace odometry } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp new file mode 100644 index 00000000000..f48468763f1 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -0,0 +1,366 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/geometry/kernel/GeometryIndexer.h" +#include "open3d/t/geometry/kernel/GeometryMacros.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryImpl.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { +namespace odometry { + +using t::geometry::kernel::NDArrayIndexer; +using t::geometry::kernel::TransformIndexer; + +static constexpr int kReduceDimOdometry = 29; +static constexpr int kJtJDimOdometry = 21; + +void ComputeOdometryResultPointToPlaneSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& target_normal_map, + const core::Tensor& intrinsics, + const core::Tensor& 
init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta) { + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); + NDArrayIndexer target_normal_indexer(target_normal_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J[6] = {0}; + float r = 0; + const bool valid = GetJacobianPointToPlane( + x, y, depth_outlier_trunc, source_vertex_indexer, + target_vertex_indexer, target_normal_indexer, ti, + J, r); + + if (valid) { + const float d_huber = + HuberDeriv(r, depth_huber_delta); + const float r_huber = + HuberLoss(r, depth_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J[i] * J[j]; + } + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[21 + i]) += + J[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }) + .wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, 
inlier_count); +} + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, 
source_vertex_indexer, + ti, J_I, r_I); + + if (valid) { + const float d_huber = + HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = + HuberLoss(r_I, intensity_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J_I[i] * J_I[j]; + } + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[21 + i]) += + J_I[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }) + .wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryResultHybridSYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_depth_dx_indexer(target_depth_dx, 2); + NDArrayIndexer target_depth_dy_indexer(target_depth_dy, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer 
source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, target_depth_dx_indexer, + target_depth_dy_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, source_vertex_indexer, + ti, J_I, J_D, r_I, r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J_I[i] * J_I[j] + J_D[i] * J_D[j]; + } + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[21 + i]) += + J_I[i] * + HuberDeriv(r_I, + intensity_huber_delta) + + J_D[i] * HuberDeriv(r_D, depth_huber_delta); + } + sycl::atomic_ref( + global_sum_ptr[27]) += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }) + .wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void 
ComputeOdometryInformationMatrixSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information) { + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = source_to_target; + TransformIndexer ti(intrinsic, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kJtJDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, + rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J_x[i] * J_x[j] + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; + } + } + } + }) + .wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + information = core::Tensor::Empty({6, 6}, core::Float64, host); + global_sum = global_sum.To(host, core::Float64); + + double* info_ptr = information.GetDataPtr(); + double* reduction_ptr = global_sum.GetDataPtr(); + for (int j = 0; j < 6; j++) { + const int64_t reduction_idx = ((j * (j + 1)) / 2); + for 
(int k = 0; k <= j; k++) { + info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k]; + info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k]; + } + } +} + +} // namespace odometry +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/Registration.cpp b/cpp/open3d/t/pipelines/kernel/Registration.cpp index 326671431f3..b3ebf4f5508 100644 --- a/cpp/open3d/t/pipelines/kernel/Registration.cpp +++ b/cpp/open3d/t/pipelines/kernel/Registration.cpp @@ -41,6 +41,16 @@ core::Tensor ComputePosePointToPlane(const core::Tensor &source_points, target_points.Contiguous(), target_normals.Contiguous(), correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePosePointToPlaneSYCL( + source_points.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), pose, residual, + inlier_count, source_points.GetDtype(), device, kernel); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -85,6 +95,18 @@ core::Tensor ComputePoseColoredICP(const core::Tensor &source_points, correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel, lambda_geometric); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePoseColoredICPSYCL( + source_points.Contiguous(), source_colors.Contiguous(), + target_points.Contiguous(), target_normals.Contiguous(), + target_colors.Contiguous(), target_color_gradients.Contiguous(), + correspondence_indices.Contiguous(), pose, residual, + inlier_count, source_points.GetDtype(), device, kernel, + lambda_geometric); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ 
-191,6 +213,21 @@ core::Tensor ComputePoseDopplerICP( v_v_in_V.Contiguous(), period, reject_outliers, doppler_outlier_threshold, kernel_geometric, kernel_doppler, lambda_doppler); + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputePoseDopplerICPSYCL( + source_points.Contiguous(), source_dopplers.Contiguous(), + source_directions.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), output_pose, residual, + inlier_count, dtype, device, R_S_to_V.Contiguous(), + r_v_to_s_in_V.Contiguous(), w_v_in_V.Contiguous(), + v_v_in_V.Contiguous(), period, reject_outliers, + doppler_outlier_threshold, kernel_geometric, kernel_doppler, + lambda_doppler); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -270,6 +307,51 @@ std::tuple ComputeRtPointToPoint( t = mean_t.Reshape({-1}) - R.Matmul(mean_s.T()).Reshape({-1}); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + // Use tensor operations which work on SYCL devices. 
+ core::Tensor valid = correspondence_indices.Ne(-1).Reshape({-1}); + + if (valid.GetLength() == 0) { + utility::LogError("No valid correspondence present."); + } + + core::Tensor source_indices = + core::Tensor::Arange(0, source_points.GetShape()[0], 1, + core::Int64, device) + .IndexGet({valid}); + core::Tensor target_indices = + correspondence_indices.IndexGet({valid}).Reshape({-1}); + + inlier_count = source_indices.GetLength(); + + core::Tensor source_select = source_points.IndexGet({source_indices}); + core::Tensor target_select = target_points.IndexGet({target_indices}); + + core::Tensor mean_s = source_select.Mean({0}, true); + core::Tensor mean_t = target_select.Mean({0}, true); + + core::Device host("CPU:0"); + core::Tensor Sxy = (target_select - mean_t) + .T() + .Matmul(source_select - mean_s) + .Div_(static_cast(inlier_count)) + .To(host, core::Float64); + + mean_s = mean_s.To(host, core::Float64); + mean_t = mean_t.To(host, core::Float64); + + core::Tensor U, D, VT; + std::tie(U, D, VT) = Sxy.SVD(); + core::Tensor S = core::Tensor::Eye(3, core::Float64, host); + if (U.Det() * (VT.T()).Det() < 0) { + S[-1][-1] = -1; + } + R = U.Matmul(S.Matmul(VT)); + t = mean_t.Reshape({-1}) - R.Matmul(mean_s.T()).Reshape({-1}); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -294,6 +376,15 @@ core::Tensor ComputeInformationMatrix( CUDA_CALL(ComputeInformationMatrixCUDA, target_points.Contiguous(), correspondence_indices.Contiguous(), information_matrix, target_points.GetDtype(), device); + } else if (target_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeInformationMatrixSYCL(target_points.Contiguous(), + correspondence_indices.Contiguous(), + information_matrix, + target_points.GetDtype(), device); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git 
a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index 3f08ca4c811..f3b65414f88 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -149,6 +149,63 @@ void ComputeInformationMatrixCUDA(const core::Tensor &target_points, const core::Device &device); #endif +#ifdef BUILD_SYCL_MODULE +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel); + +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric); + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel 
&kernel_doppler, + const double lambda_doppler); + +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device); +#endif + template OPEN3D_HOST_DEVICE inline bool GetJacobianPointToPlane( int64_t workload_idx, diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp new file mode 100644 index 00000000000..17bf9768f93 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -0,0 +1,462 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/RegistrationImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" +#include "open3d/t/pipelines/registration/RobustKernel.h" +#include "open3d/t/pipelines/registration/RobustKernelImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +static constexpr int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) + +void ComputePosePointToPlaneSYCL( + const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + sycl::queue queue = + 
core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_ij[6] = {0}; + scalar_t r = 0; + const bool valid = + GetJacobianPointToPlane( + workload_idx, + source_points_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); + + if (valid) { + const scalar_t w = + GetWeightFromRobustKernel(r); + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[i]) += + J_ij[j] * w * J_ij[k]; + ++i; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[21 + j]) += + J_ij[j] * w * r; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[27]) += r; + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[28]) += + scalar_t(1); + } + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseColoredICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor 
&target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t sqrt_lambda_geometric = + static_cast(sqrt(lambda_geometric)); + const scalar_t sqrt_lambda_photometric = + static_cast(sqrt(1.0 - lambda_geometric)); + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *source_colors_ptr = + source_colors.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const scalar_t *target_colors_ptr = + target_colors.GetDataPtr(); + const scalar_t *target_color_gradients_ptr = + target_color_gradients.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_G[6] = {0}, J_I[6] = {0}; + scalar_t r_G = 0, r_I = 0; + + const bool valid = + GetJacobianColoredICP( + workload_idx, + source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, r_I); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernel(r_G); + const scalar_t w_I = + 
GetWeightFromRobustKernel(r_I); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[i]) += + J_G[j] * w_G * + J_G[k] + + J_I[j] * w_I * J_I[k]; + ++i; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[21 + j]) += + J_G[j] * w_G * r_G + + J_I[j] * w_I * r_I; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[27]) += + r_G * r_G + r_I * r_I; + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[28]) += + scalar_t(1); + } + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel &kernel_doppler, + const double lambda_doppler) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t 
sqrt_lambda_geometric = + sqrt(1.0 - static_cast(lambda_doppler)); + const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler); + const scalar_t sqrt_lambda_doppler_by_dt = + sqrt_lambda_doppler / static_cast(period); + + // Pre-compute v_s_in_S using a single-task kernel. + { + const scalar_t *R_S_to_V_ptr = R_S_to_V.GetDataPtr(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr(); + const scalar_t *w_v_in_V_ptr = w_v_in_V.GetDataPtr(); + const scalar_t *v_v_in_V_ptr = v_v_in_V.GetDataPtr(); + scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); + queue.single_task([=]() { + PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, + w_v_in_V_ptr, v_v_in_V_ptr, + v_s_in_S_ptr); + }).wait_and_throw(); + } + + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION( + scalar_t, kernel_geometric.type_, + kernel_geometric.scaling_parameter_, kernel_doppler.type_, + kernel_doppler.scaling_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *source_dopplers_ptr = + source_dopplers.GetDataPtr(); + const scalar_t *source_directions_ptr = + source_directions.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr(); + const scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_G[6] = {0}, J_D[6] = {0}; + scalar_t r_G = 0, r_D = 0; + + const bool valid = + GetJacobianDopplerICP( + workload_idx, + source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + 
r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, J_D, r_G, r_D); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernelFirst( + r_G); + const scalar_t w_D = + GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[i]) += + J_G[j] * w_G * + J_G[k] + + J_D[j] * w_D * J_D[k]; + ++i; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[21 + j]) += + J_G[j] * w_G * r_G + + J_D[j] * w_D * r_D; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[27]) += + r_G * r_G + r_D * r_D; + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[28]) += + scalar_t(1); + } + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); +} + +void ComputeInformationMatrixSYCL( + const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device) { + const int n = correspondence_indices.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + 
scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + const bool valid = GetInformationJacobians( + workload_idx, target_points_ptr, + correspondence_indices_ptr, J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[i]) += + J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + ++i; + } + } + } + }) + .wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + core::Tensor global_sum_cpu = global_sum.To(host, core::Float64); + double *sum_ptr = global_sum_cpu.GetDataPtr(); + + // Information matrix is on CPU of type Float64. + double *GTG_ptr = information_matrix.GetDataPtr(); + + int i = 0; + for (int j = 0; j < 6; j++) { + for (int k = 0; k <= j; k++) { + GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i]; + ++i; + } + } + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp index c181ade770e..5526e2f1f59 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp @@ -63,6 +63,13 @@ static void PoseToTransformationDevice( PoseToTransformationCUDA(transformation_ptr, pose_ptr); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + PoseToTransformationSYCL(transformation_ptr, pose_ptr, + transformation.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -109,6 +116,13 @@ static void TransformationToPoseDevice( TransformationToPoseCUDA(pose_ptr, transformation_ptr); #else utility::LogError("Not compiled with 
CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + TransformationToPoseSYCL(pose_ptr, transformation_ptr, + pose.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h index e75484ce3f5..02cc741d62e 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h @@ -77,6 +77,26 @@ void TransformationToPoseCUDA(scalar_t *pose_ptr, const scalar_t *transformation_ptr); #endif +#ifdef BUILD_SYCL_MODULE +/// \brief Helper function for PoseToTransformationSYCL. +/// Do not call this independently, as it only sets the transformation part +/// in transformation matrix, using the Pose, the rest is set in +/// the parent function PoseToTransformation. +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device); + +/// \brief Helper function for TransformationToPoseSYCL. +/// Do not call this independently, as it only sets the rotation part in the +/// pose, using the Transformation, the rest is set in the parent function +/// TransformationToPose. 
+template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp new file mode 100644 index 00000000000..1ec3e943a3c --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp @@ -0,0 +1,56 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Device.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/t/pipelines/kernel/TransformationConverterImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + PoseToTransformationImpl(transformation_ptr, pose_ptr); + }).wait_and_throw(); +} + +template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + TransformationToPoseImpl(pose_ptr, transformation_ptr); + }).wait_and_throw(); +} + +template void PoseToTransformationSYCL(float *, + const float *, + const core::Device &); +template void PoseToTransformationSYCL(double *, + const double *, + const core::Device &); +template void TransformationToPoseSYCL(float *, + 
const float *, + const core::Device &); +template void TransformationToPoseSYCL(double *, + const double *, + const core::Device &); + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d From f547842fb1c688772ff18e59a7b1d0c2db645037 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:04:52 +0000 Subject: [PATCH 03/14] Fix RGBDOdometryImpl.h: source_normal_map -> target_normal_map in SYCL declaration Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index ff854a406e2..22955e17ab6 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -129,7 +129,7 @@ void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_depth, void ComputeOdometryResultPointToPlaneSYCL( const core::Tensor& source_vertex_map, const core::Tensor& target_vertex_map, - const core::Tensor& source_normal_map, + const core::Tensor& target_normal_map, const core::Tensor& intrinsics, const core::Tensor& init_source_to_target, core::Tensor& delta, From 83aa8d94c616dae820facb36f8e2923da0c3fe18 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Feb 2026 21:49:49 +0000 Subject: [PATCH 04/14] Fix ubuntu-sycl CI: use open3d_sycl_target_sources to add -fsycl flag to SYCL kernel files Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/pipelines/kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index 2a9478b4be9..785d58c85e4 100644 --- 
a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -23,7 +23,7 @@ if (BUILD_CUDA_MODULE) endif() if (BUILD_SYCL_MODULE) - target_sources(tpipelines_kernel PRIVATE + open3d_sycl_target_sources(tpipelines_kernel PRIVATE RegistrationSYCL.cpp FillInLinearSystemSYCL.cpp RGBDOdometrySYCL.cpp From 7b846bb7511d2c307574d24838fd97e08e48883e Mon Sep 17 00:00:00 2001 From: Sameer Sheorey Date: Fri, 6 Mar 2026 14:19:24 -0800 Subject: [PATCH 05/14] style fix --- cpp/open3d/t/pipelines/kernel/Feature.cpp | 3 +- cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp | 173 +++--- .../t/pipelines/kernel/FillInLinearSystem.cpp | 14 +- .../t/pipelines/kernel/FillInLinearSystem.h | 54 +- .../kernel/FillInLinearSystemSYCL.cpp | 579 ++++++++---------- .../t/pipelines/kernel/RGBDOdometry.cpp | 7 +- .../t/pipelines/kernel/RGBDOdometryImpl.h | 42 +- .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 343 +++++------ .../t/pipelines/kernel/Registration.cpp | 7 +- .../t/pipelines/kernel/RegistrationImpl.h | 52 +- .../t/pipelines/kernel/RegistrationSYCL.cpp | 159 +++-- .../kernel/TransformationConverterImpl.h | 8 +- .../kernel/TransformationConverterSYCL.cpp | 8 +- 13 files changed, 663 insertions(+), 786 deletions(-) diff --git a/cpp/open3d/t/pipelines/kernel/Feature.cpp b/cpp/open3d/t/pipelines/kernel/Feature.cpp index 4677396f1d7..5a779ecc48b 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.cpp +++ b/cpp/open3d/t/pipelines/kernel/Feature.cpp @@ -51,7 +51,8 @@ void ComputeFPFHFeature( } else if (points_d.IsSYCL()) { #ifdef BUILD_SYCL_MODULE ComputeFPFHFeatureSYCL(points_d, normals_d, indices, distance2, - counts_d, fpfhs, mask, map_info_idx_to_point_idx); + counts_d, fpfhs, mask, + map_info_idx_to_point_idx); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp index 7338edc63f8..8c41acf7aab 100644 --- 
a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp @@ -116,109 +116,86 @@ void ComputeFPFHFeatureSYCL( map_point_idx_to_spfh_idx.GetDataPtr(); // Compute SPFH features for the points. - queue.parallel_for( - sycl::range<1>{(size_t)n_spfh}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - int64_t workload_point_idx = - filter_fpfh - ? map_spfh_info_idx_to_point_idx_ptr - [workload_idx] - : workload_idx; - int64_t idx = 3 * workload_point_idx; - const scalar_t *point = points_ptr + idx; - const scalar_t *normal = normals_ptr + idx; - - const int indice_size = + queue.parallel_for(sycl::range<1>{(size_t)n_spfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_point_idx = + filter_fpfh ? map_spfh_info_idx_to_point_idx_ptr + [workload_idx] + : workload_idx; + int64_t idx = 3 * workload_point_idx; + const scalar_t *point = points_ptr + idx; + const scalar_t *normal = normals_ptr + idx; + + const int indice_size = + is_radius_search ? (counts_ptr[workload_idx + 1] - + counts_ptr[workload_idx]) + : counts_ptr[workload_idx]; + + if (indice_size > 1) { + const scalar_t hist_incr = + 100.0 / static_cast(indice_size - 1); + for (int i = 1; i < indice_size; i++) { + const int point_idx = is_radius_search - ? (counts_ptr[workload_idx + 1] - - counts_ptr[workload_idx]) - : counts_ptr[workload_idx]; - - if (indice_size > 1) { - const scalar_t hist_incr = - 100.0 / static_cast( - indice_size - 1); - for (int i = 1; i < indice_size; i++) { - const int point_idx = - is_radius_search - ? indices_ptr - [i + - counts_ptr - [workload_idx]] - : indices_ptr[workload_idx * - nn_size + - i]; - - const scalar_t *point_ref = - points_ptr + 3 * point_idx; - const scalar_t *normal_ref = - normals_ptr + 3 * point_idx; - scalar_t fea[4] = {0}; - ComputePairFeature( - point, normal, point_ref, normal_ref, - fea); - UpdateSPFHFeature( - fea, workload_idx, hist_incr, - spfhs_ptr); - } - } - }) - .wait_and_throw(); + ? 
indices_ptr[i + + counts_ptr[workload_idx]] + : indices_ptr[workload_idx * nn_size + + i]; + + const scalar_t *point_ref = points_ptr + 3 * point_idx; + const scalar_t *normal_ref = + normals_ptr + 3 * point_idx; + scalar_t fea[4] = {0}; + ComputePairFeature(point, normal, point_ref, + normal_ref, fea); + UpdateSPFHFeature(fea, workload_idx, + hist_incr, spfhs_ptr); + } + } + }).wait_and_throw(); // Compute FPFH features for the points. - queue.parallel_for( - sycl::range<1>{(size_t)n_fpfh}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - int64_t workload_spfh_idx = - filter_fpfh - ? map_point_idx_to_spfh_idx_ptr - [map_fpfh_idx_to_point_idx_ptr - [workload_idx]] - : workload_idx; - const int indice_size = + queue.parallel_for(sycl::range<1>{(size_t)n_fpfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_spfh_idx = + filter_fpfh ? map_point_idx_to_spfh_idx_ptr + [map_fpfh_idx_to_point_idx_ptr + [workload_idx]] + : workload_idx; + const int indice_size = + is_radius_search ? (counts_ptr[workload_spfh_idx + 1] - + counts_ptr[workload_spfh_idx]) + : counts_ptr[workload_spfh_idx]; + if (indice_size > 1) { + scalar_t sum[3] = {0.0, 0.0, 0.0}; + for (int i = 1; i < indice_size; i++) { + const int idx = is_radius_search - ? (counts_ptr[workload_spfh_idx + 1] - - counts_ptr[workload_spfh_idx]) - : counts_ptr[workload_spfh_idx]; - if (indice_size > 1) { - scalar_t sum[3] = {0.0, 0.0, 0.0}; - for (int i = 1; i < indice_size; i++) { - const int idx = - is_radius_search - ? i + - counts_ptr - [workload_spfh_idx] - : workload_spfh_idx * nn_size + - i; - const scalar_t dist = distance2_ptr[idx]; - if (dist == 0.0) continue; - const int32_t spfh_idx = - filter_fpfh - ? 
map_point_idx_to_spfh_idx_ptr - [indices_ptr[idx]] - : indices_ptr[idx]; - for (int j = 0; j < 33; j++) { - const scalar_t val = - spfhs_ptr[spfh_idx * 33 + j] / - dist; - sum[j / 11] += val; - fpfhs_ptr[workload_idx * 33 + j] += val; - } - } - for (int j = 0; j < 3; j++) { - sum[j] = sum[j] != 0.0 ? 100.0 / sum[j] : 0.0; - } - for (int j = 0; j < 33; j++) { - fpfhs_ptr[workload_idx * 33 + j] *= - sum[j / 11]; - fpfhs_ptr[workload_idx * 33 + j] += - spfhs_ptr[workload_spfh_idx * 33 + j]; - } + ? i + counts_ptr[workload_spfh_idx] + : workload_spfh_idx * nn_size + i; + const scalar_t dist = distance2_ptr[idx]; + if (dist == 0.0) continue; + const int32_t spfh_idx = + filter_fpfh ? map_point_idx_to_spfh_idx_ptr + [indices_ptr[idx]] + : indices_ptr[idx]; + for (int j = 0; j < 33; j++) { + const scalar_t val = + spfhs_ptr[spfh_idx * 33 + j] / dist; + sum[j / 11] += val; + fpfhs_ptr[workload_idx * 33 + j] += val; } - }) - .wait_and_throw(); + } + for (int j = 0; j < 3; j++) { + sum[j] = sum[j] != 0.0 ? 
100.0 / sum[j] : 0.0; + } + for (int j = 0; j < 33; j++) { + fpfhs_ptr[workload_idx * 33 + j] *= sum[j / 11]; + fpfhs_ptr[workload_idx * 33 + j] += + spfhs_ptr[workload_spfh_idx * 33 + j]; + } + } + }).wait_and_throw(); }); } diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp index 82a2c33ac93..f9e358d6636 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp @@ -134,10 +134,9 @@ void FillInSLACAlignmentTerm(core::Tensor &AtA, #endif } else if (AtA.IsSYCL()) { #ifdef BUILD_SYCL_MODULE - FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, - normal_ps, Ri_normal_ps, - RjT_Ri_normal_ps, cgrid_idx_ps, - cgrid_idx_qs, cgrid_ratio_ps, + FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, normal_ps, + Ri_normal_ps, RjT_Ri_normal_ps, + cgrid_idx_ps, cgrid_idx_qs, cgrid_ratio_ps, cgrid_ratio_qs, i, j, n, threshold); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); @@ -183,10 +182,9 @@ void FillInSLACRegularizerTerm(core::Tensor &AtA, #endif } else if (AtA.IsSYCL()) { #ifdef BUILD_SYCL_MODULE - FillInSLACRegularizerTermSYCL(AtA, Atb, residual, grid_idx, - grid_nbs_idx, grid_nbs_mask, - positions_init, positions_curr, weight, n, - anchor_idx); + FillInSLACRegularizerTermSYCL( + AtA, Atb, residual, grid_idx, grid_nbs_idx, grid_nbs_mask, + positions_init, positions_curr, weight, n, anchor_idx); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h index 91014b84dd2..69da827ab61 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h @@ -133,43 +133,43 @@ void FillInSLACRegularizerTermCUDA(core::Tensor &AtA, #endif #ifdef BUILD_SYCL_MODULE void FillInRigidAlignmentTermSYCL(core::Tensor 
&AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &Ti_qs, - const core::Tensor &Tj_qs, - const core::Tensor &Ri_normal_ps, - int i, - int j, - float threshold); - -void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &Ti_qs, const core::Tensor &Tj_qs, - const core::Tensor &normal_ps, const core::Tensor &Ri_normal_ps, - const core::Tensor &RjT_Ri_normal_ps, - const core::Tensor &cgrid_idx_ps, - const core::Tensor &cgrid_idx_qs, - const core::Tensor &cgrid_ratio_qs, - const core::Tensor &cgrid_ratio_ps, int i, int j, - int n, float threshold); +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &normal_ps, + const core::Tensor &Ri_normal_ps, + const core::Tensor &RjT_Ri_normal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n, + float threshold); + void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &grid_idx, - const core::Tensor &grid_nbs_idx, - const core::Tensor &grid_nbs_mask, - const core::Tensor &positions_init, - const core::Tensor &positions_curr, - float weight, - int n, - int anchor_idx); + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n, + int anchor_idx); #endif diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp index e99b7f5ccb6..2f16deec4e8 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ 
-21,14 +21,14 @@ namespace pipelines { namespace kernel { void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &Ti_ps, - const core::Tensor &Tj_qs, - const core::Tensor &Ri_normal_ps, - int i, - int j, - float threshold) { + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_ps, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold) { core::Device device = AtA.GetDevice(); int64_t n = Ti_ps.GetLength(); if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) { @@ -52,54 +52,46 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - const float *p_prime = Ti_ps_ptr + 3 * workload_idx; - const float *q_prime = Tj_qs_ptr + 3 * workload_idx; - const float *normal_p_prime = - Ri_normal_ps_ptr + 3 * workload_idx; - - float r = (p_prime[0] - q_prime[0]) * - normal_p_prime[0] + - (p_prime[1] - q_prime[1]) * - normal_p_prime[1] + - (p_prime[2] - q_prime[2]) * normal_p_prime[2]; - if (sycl::fabs(r) > threshold) return; - - float J_ij[12]; - J_ij[0] = -q_prime[2] * normal_p_prime[1] + - q_prime[1] * normal_p_prime[2]; - J_ij[1] = q_prime[2] * normal_p_prime[0] - - q_prime[0] * normal_p_prime[2]; - J_ij[2] = -q_prime[1] * normal_p_prime[0] + - q_prime[0] * normal_p_prime[1]; - J_ij[3] = normal_p_prime[0]; - J_ij[4] = normal_p_prime[1]; - J_ij[5] = normal_p_prime[2]; - for (int k = 0; k < 6; ++k) { - J_ij[k + 6] = -J_ij[k]; - } - - for (int i_local = 0; i_local < 12; ++i_local) { - for (int j_local = 0; j_local < 12; ++j_local) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_local_ptr[i_local * 12 + j_local]) += - J_ij[i_local] * J_ij[j_local]; - } - sycl::atomic_ref( - Atb_local_ptr[i_local]) += - 
J_ij[i_local] * r; - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *p_prime = Ti_ps_ptr + 3 * workload_idx; + const float *q_prime = Tj_qs_ptr + 3 * workload_idx; + const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx; + + float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * normal_p_prime[2]; + if (sycl::fabs(r) > threshold) return; + + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; ++i_local) { + for (int j_local = 0; j_local < 12; ++j_local) { sycl::atomic_ref( - *residual_ptr) += r * r; - }) - .wait_and_throw(); + AtA_local_ptr[i_local * 12 + j_local]) += + J_ij[i_local] * J_ij[j_local]; + } + sycl::atomic_ref( + Atb_local_ptr[i_local]) += J_ij[i_local] * r; + } + sycl::atomic_ref(*residual_ptr) += + r * r; + }).wait_and_throw(); // Then fill-in the large linear system. 
std::vector indices_vec(12); @@ -129,25 +121,24 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, } void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &Ti_Cps, - const core::Tensor &Tj_Cqs, - const core::Tensor &Cnormal_ps, - const core::Tensor &Ri_Cnormal_ps, - const core::Tensor &RjT_Ri_Cnormal_ps, - const core::Tensor &cgrid_idx_ps, - const core::Tensor &cgrid_idx_qs, - const core::Tensor &cgrid_ratio_qs, - const core::Tensor &cgrid_ratio_ps, - int i, - int j, - int n_frags, - float threshold) { + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_Cps, + const core::Tensor &Tj_Cqs, + const core::Tensor &Cnormal_ps, + const core::Tensor &Ri_Cnormal_ps, + const core::Tensor &RjT_Ri_Cnormal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n_frags, + float threshold) { int64_t n = Ti_Cps.GetLength(); if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n || - Ri_Cnormal_ps.GetLength() != n || - RjT_Ri_Cnormal_ps.GetLength() != n || + Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n || cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n || cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) { utility::LogError( @@ -160,10 +151,8 @@ void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, float *Atb_ptr = static_cast(Atb.GetDataPtr()); float *residual_ptr = static_cast(residual.GetDataPtr()); - const float *Ti_Cps_ptr = - static_cast(Ti_Cps.GetDataPtr()); - const float *Tj_Cqs_ptr = - static_cast(Tj_Cqs.GetDataPtr()); + const float *Ti_Cps_ptr = static_cast(Ti_Cps.GetDataPtr()); + const float *Tj_Cqs_ptr = static_cast(Tj_Cqs.GetDataPtr()); const float *Cnormal_ps_ptr = static_cast(Cnormal_ps.GetDataPtr()); const float *Ri_Cnormal_ps_ptr = @@ -182,113 +171,88 @@ void FillInSLACAlignmentTermSYCL(core::Tensor 
&AtA, sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx; - const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx; - const float *Cnormal_p = - Cnormal_ps_ptr + 3 * workload_idx; - const float *Ri_Cnormal_p = - Ri_Cnormal_ps_ptr + 3 * workload_idx; - const float *RjTRi_Cnormal_p = - RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx; - - const int *cgrid_idx_p = - cgrid_idx_ps_ptr + 8 * workload_idx; - const int *cgrid_idx_q = - cgrid_idx_qs_ptr + 8 * workload_idx; - const float *cgrid_ratio_p = - cgrid_ratio_ps_ptr + 8 * workload_idx; - const float *cgrid_ratio_q = - cgrid_ratio_qs_ptr + 8 * workload_idx; - - float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + - (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + - (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; - if (sycl::fabs(r) > threshold) return; - - float J[60]; - int idx[60]; - - J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + - Tj_Cq[1] * Ri_Cnormal_p[2]; - J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - - Tj_Cq[0] * Ri_Cnormal_p[2]; - J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + - Tj_Cq[0] * Ri_Cnormal_p[1]; - J[3] = Ri_Cnormal_p[0]; - J[4] = Ri_Cnormal_p[1]; - J[5] = Ri_Cnormal_p[2]; - - for (int k = 0; k < 6; ++k) { - J[k + 6] = -J[k]; - idx[k + 0] = 6 * i + k; - idx[k + 6] = 6 * j + k; - } - - for (int k = 0; k < 8; ++k) { - J[12 + k * 3 + 0] = - cgrid_ratio_p[k] * Cnormal_p[0]; - J[12 + k * 3 + 1] = - cgrid_ratio_p[k] * Cnormal_p[1]; - J[12 + k * 3 + 2] = - cgrid_ratio_p[k] * Cnormal_p[2]; - idx[12 + k * 3 + 0] = - 6 * n_frags + cgrid_idx_p[k] * 3 + 0; - idx[12 + k * 3 + 1] = - 6 * n_frags + cgrid_idx_p[k] * 3 + 1; - idx[12 + k * 3 + 2] = - 6 * n_frags + cgrid_idx_p[k] * 3 + 2; - } - - for (int k = 0; k < 8; ++k) { - J[36 + k * 3 + 0] = - -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; - J[36 + k * 3 + 1] = - -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; - J[36 + k * 3 + 2] = - 
-cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; - idx[36 + k * 3 + 0] = - 6 * n_frags + cgrid_idx_q[k] * 3 + 0; - idx[36 + k * 3 + 1] = - 6 * n_frags + cgrid_idx_q[k] * 3 + 1; - idx[36 + k * 3 + 2] = - 6 * n_frags + cgrid_idx_q[k] * 3 + 2; - } - - for (int ki = 0; ki < 60; ++ki) { - for (int kj = 0; kj < 60; ++kj) { - float AtA_ij = J[ki] * J[kj]; - int ij = idx[ki] * n_vars + idx[kj]; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[ij]) += AtA_ij; - } - sycl::atomic_ref( - Atb_ptr[idx[ki]]) += J[ki] * r; - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx; + const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx; + const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx; + const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx; + const float *RjTRi_Cnormal_p = + RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx; + + const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx; + const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx; + const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx; + const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx; + + float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + + (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + + (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; + if (sycl::fabs(r) > threshold) return; + + float J[60]; + int idx[60]; + + J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2]; + J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2]; + J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1]; + J[3] = Ri_Cnormal_p[0]; + J[4] = Ri_Cnormal_p[1]; + J[5] = Ri_Cnormal_p[2]; + + for (int k = 0; k < 6; ++k) { + J[k + 6] = -J[k]; + idx[k + 0] = 6 * i + k; + idx[k + 6] = 6 * j + k; + } + + for (int k = 0; k < 8; ++k) { + J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0]; + J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1]; + 
J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2]; + idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0; + idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1; + idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2; + } + + for (int k = 0; k < 8; ++k) { + J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; + J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; + J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; + idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0; + idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1; + idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2; + } + + for (int ki = 0; ki < 60; ++ki) { + for (int kj = 0; kj < 60; ++kj) { + float AtA_ij = J[ki] * J[kj]; + int ij = idx[ki] * n_vars + idx[kj]; sycl::atomic_ref( - *residual_ptr) += r * r; - }) - .wait_and_throw(); + AtA_ptr[ij]) += AtA_ij; + } + sycl::atomic_ref( + Atb_ptr[idx[ki]]) += J[ki] * r; + } + sycl::atomic_ref(*residual_ptr) += + r * r; + }).wait_and_throw(); } void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &grid_idx, - const core::Tensor &grid_nbs_idx, - const core::Tensor &grid_nbs_mask, - const core::Tensor &positions_init, - const core::Tensor &positions_curr, - float weight, - int n_frags, - int anchor_idx) { + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n_frags, + int anchor_idx) { int64_t n = grid_idx.GetLength(); int64_t n_vars = Atb.GetLength(); @@ -297,8 +261,7 @@ void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, float *Atb_ptr = static_cast(Atb.GetDataPtr()); float *residual_ptr = static_cast(residual.GetDataPtr()); - const int *grid_idx_ptr = - static_cast(grid_idx.GetDataPtr()); + const int *grid_idx_ptr = 
static_cast(grid_idx.GetDataPtr()); const int *grid_nbs_idx_ptr = static_cast(grid_nbs_idx.GetDataPtr()); const bool *grid_nbs_mask_ptr = @@ -310,144 +273,130 @@ void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - int idx_i = grid_idx_ptr[workload_idx]; - - const int *idx_nbs = - grid_nbs_idx_ptr + 6 * workload_idx; - const bool *mask_nbs = - grid_nbs_mask_ptr + 6 * workload_idx; - - float cov[3][3] = {{0}}; - float U[3][3], V[3][3], S[3]; - - int cnt = 0; - for (int k = 0; k < 6; ++k) { - if (!mask_nbs[k]) continue; - int idx_k = idx_nbs[k]; - - float diff_ik_init[3] = { - positions_init_ptr[idx_i * 3 + 0] - - positions_init_ptr[idx_k * 3 + 0], - positions_init_ptr[idx_i * 3 + 1] - - positions_init_ptr[idx_k * 3 + 1], - positions_init_ptr[idx_i * 3 + 2] - - positions_init_ptr[idx_k * 3 + 2]}; - float diff_ik_curr[3] = { - positions_curr_ptr[idx_i * 3 + 0] - - positions_curr_ptr[idx_k * 3 + 0], - positions_curr_ptr[idx_i * 3 + 1] - - positions_curr_ptr[idx_k * 3 + 1], - positions_curr_ptr[idx_i * 3 + 2] - - positions_curr_ptr[idx_k * 3 + 2]}; - - for (int ii = 0; ii < 3; ++ii) { - for (int jj = 0; jj < 3; ++jj) { - cov[ii][jj] += diff_ik_init[ii] * - diff_ik_curr[jj]; - } - } - ++cnt; - } - - if (cnt < 3) return; - - core::linalg::kernel::svd3x3(*cov, *U, S, *V); - - float R[3][3]; - core::linalg::kernel::transpose3x3_(*U); - core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); - - float d = core::linalg::kernel::det3x3(*R); - if (d < 0) { - U[2][0] = -U[2][0]; - U[2][1] = -U[2][1]; - U[2][2] = -U[2][2]; - core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); - } - - if (idx_i == anchor_idx) { - R[0][0] = R[1][1] = R[2][2] = 1; - R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = - R[2][1] = 0; + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx 
= id[0]; + int idx_i = grid_idx_ptr[workload_idx]; + + const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx; + const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx; + + float cov[3][3] = {{0}}; + float U[3][3], V[3][3], S[3]; + + int cnt = 0; + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + + for (int ii = 0; ii < 3; ++ii) { + for (int jj = 0; jj < 3; ++jj) { + cov[ii][jj] += diff_ik_init[ii] * diff_ik_curr[jj]; } + } + ++cnt; + } + + if (cnt < 3) return; + + core::linalg::kernel::svd3x3(*cov, *U, S, *V); + + float R[3][3]; + core::linalg::kernel::transpose3x3_(*U); + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + + float d = core::linalg::kernel::det3x3(*R); + if (d < 0) { + U[2][0] = -U[2][0]; + U[2][1] = -U[2][1]; + U[2][2] = -U[2][2]; + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + } + + if (idx_i == anchor_idx) { + R[0][0] = R[1][1] = R[2][2] = 1; + R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0; + } + + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + 
positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + float R_diff_ik_curr[3]; + core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, + R_diff_ik_curr); + + float local_r[3]; + local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; + local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; + local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; + + int offset_idx_i = 3 * idx_i + 6 * n_frags; + int offset_idx_k = 3 * idx_k + 6 * n_frags; + + sycl::atomic_ref(*residual_ptr) += + weight * + (local_r[0] * local_r[0] + local_r[1] * local_r[1] + + local_r[2] * local_r[2]); + + for (int axis = 0; axis < 3; ++axis) { + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_i + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_k + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_k + axis]) -= weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_i + axis]) -= weight; - for (int k = 0; k < 6; ++k) { - if (!mask_nbs[k]) continue; - int idx_k = idx_nbs[k]; - - float diff_ik_init[3] = { - positions_init_ptr[idx_i * 3 + 0] - - positions_init_ptr[idx_k * 3 + 0], - positions_init_ptr[idx_i * 3 + 1] - - positions_init_ptr[idx_k * 3 + 1], - positions_init_ptr[idx_i * 3 + 2] - - positions_init_ptr[idx_k * 3 + 2]}; - float diff_ik_curr[3] = { - positions_curr_ptr[idx_i * 3 + 0] - - positions_curr_ptr[idx_k * 3 + 0], - positions_curr_ptr[idx_i * 3 + 1] - - positions_curr_ptr[idx_k * 3 + 1], - positions_curr_ptr[idx_i * 3 + 2] - - positions_curr_ptr[idx_k * 3 + 2]}; - float R_diff_ik_curr[3]; - core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, - R_diff_ik_curr); - - float local_r[3]; - local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; - local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; - local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; - - int offset_idx_i = 3 * idx_i + 6 * n_frags; - int offset_idx_k = 
3 * idx_k + 6 * n_frags; - - sycl::atomic_ref( - *residual_ptr) += - weight * (local_r[0] * local_r[0] + - local_r[1] * local_r[1] + - local_r[2] * local_r[2]); - - for (int axis = 0; axis < 3; ++axis) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_i + axis) * n_vars + - offset_idx_i + axis]) += weight; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_k + axis) * n_vars + - offset_idx_k + axis]) += weight; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_i + axis) * n_vars + - offset_idx_k + axis]) -= weight; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_k + axis) * n_vars + - offset_idx_i + axis]) -= weight; - - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - Atb_ptr[offset_idx_i + axis]) += - weight * local_r[axis]; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - Atb_ptr[offset_idx_k + axis]) -= - weight * local_r[axis]; - } - } - }) - .wait_and_throw(); + sycl::atomic_ref( + Atb_ptr[offset_idx_i + axis]) += + weight * local_r[axis]; + sycl::atomic_ref( + Atb_ptr[offset_idx_k + axis]) -= + weight * local_r[axis]; + } + } + }).wait_and_throw(); } } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp index 6b54fb41961..4a9afaff45f 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp @@ -254,10 +254,9 @@ void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map, square_dist_thr, information); } else if (device.GetType() == core::Device::DeviceType::SYCL) { #ifdef BUILD_SYCL_MODULE - ComputeOdometryInformationMatrixSYCL(source_vertex_map, - target_vertex_map, intrinsic, - 
source_to_target, square_dist_thr, - information); + ComputeOdometryInformationMatrixSYCL( + source_vertex_map, target_vertex_map, intrinsic, + source_to_target, square_dist_thr, information); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index 22955e17ab6..3c65eec81e8 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -155,29 +155,29 @@ void ComputeOdometryResultIntensitySYCL( const float intensity_huber_delta); void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_depth_dx, - const core::Tensor& target_depth_dy, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const float intensity_huber_delta); + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta); void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& 
intrinsic, - const core::Tensor& source_to_target, - const float square_dist_thr, - core::Tensor& information); + const core::Tensor& target_depth, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information); #endif } // namespace odometry diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp index f48468763f1..d49ed13b4b1 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -54,52 +54,42 @@ void ComputeOdometryResultPointToPlaneSYCL( sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J[6] = {0}; - float r = 0; - const bool valid = GetJacobianPointToPlane( - x, y, depth_outlier_trunc, source_vertex_indexer, - target_vertex_indexer, target_normal_indexer, ti, - J, r); - - if (valid) { - const float d_huber = - HuberDeriv(r, depth_huber_delta); - const float r_huber = - HuberLoss(r, depth_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J[i] * J[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[21 + i]) += - J[i] * d_huber; - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J[6] = {0}; + float r = 0; + const bool valid = GetJacobianPointToPlane( + x, y, depth_outlier_trunc, source_vertex_indexer, + target_vertex_indexer, 
target_normal_indexer, ti, J, r); + + if (valid) { + const float d_huber = HuberDeriv(r, depth_huber_delta); + const float r_huber = HuberLoss(r, depth_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; + global_sum_ptr[offset++]) += J[i] * J[j]; } - }) - .wait_and_throw(); + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref( + global_sum_ptr[21 + i]) += J[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } @@ -141,77 +131,66 @@ void ComputeOdometryResultIntensitySYCL( sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}; - float r_I = 0; - const bool valid = GetJacobianIntensity( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, - target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, - ti, J_I, r_I); - - if (valid) { - const float d_huber = - HuberDeriv(r_I, intensity_huber_delta); - const float r_huber = - HuberLoss(r_I, intensity_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J_I[i] * J_I[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[21 + i]) += - J_I[i] * d_huber; - } - sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; + 
queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, target_intensity_dx_indexer, + target_intensity_dy_indexer, source_vertex_indexer, ti, + J_I, r_I); + + if (valid) { + const float d_huber = HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = HuberLoss(r_I, intensity_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; + global_sum_ptr[offset++]) += J_I[i] * J_I[j]; } - }) - .wait_and_throw(); + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref( + global_sum_ptr[21 + i]) += J_I[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryResultHybridSYCL( - const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_depth_dx, - const core::Tensor& target_depth_dy, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const float intensity_huber_delta) { +void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + 
const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); NDArrayIndexer target_depth_indexer(target_depth, 2); NDArrayIndexer source_intensity_indexer(source_intensity, 2); @@ -236,67 +215,58 @@ void ComputeOdometryResultHybridSYCL( sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}, J_D[6] = {0}; - float r_I = 0, r_D = 0; - const bool valid = GetJacobianHybrid( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, target_depth_dx_indexer, - target_depth_dy_indexer, - target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, - ti, J_I, J_D, r_I, r_D); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J_I[i] * J_I[j] + J_D[i] * J_D[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[21 + i]) += - J_I[i] * - HuberDeriv(r_I, - intensity_huber_delta) + - J_D[i] * HuberDeriv(r_D, depth_huber_delta); - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = 
workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, target_depth_dx_indexer, + target_depth_dy_indexer, target_intensity_dx_indexer, + target_intensity_dy_indexer, source_vertex_indexer, ti, + J_I, J_D, r_I, r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { sycl::atomic_ref( - global_sum_ptr[27]) += - HuberLoss(r_I, intensity_huber_delta) + - HuberLoss(r_D, depth_huber_delta); - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; + global_sum_ptr[offset++]) += + J_I[i] * J_I[j] + J_D[i] * J_D[j]; } - }) - .wait_and_throw(); + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref( + global_sum_ptr[21 + i]) += + J_I[i] * HuberDeriv(r_I, intensity_huber_delta) + + J_D[i] * HuberDeriv(r_D, depth_huber_delta); + } + sycl::atomic_ref( + global_sum_ptr[27]) += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryInformationMatrixSYCL( - const core::Tensor& source_vertex_map, - const core::Tensor& target_vertex_map, - const core::Tensor& intrinsic, - const core::Tensor& source_to_target, - const float square_dist_thr, - core::Tensor& information) { +void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); @@ -314,35 +284,30 @@ void ComputeOdometryInformationMatrixSYCL( sycl::queue 
queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - float rx = 0, ry = 0, rz = 0; - const bool valid = GetJacobianPointToPoint( - x, y, square_dist_thr, source_vertex_indexer, - target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, - rz); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J_x[i] * J_x[j] + J_y[i] * J_y[j] + - J_z[i] * J_z[j]; - } - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref( + global_sum_ptr[offset++]) += J_x[i] * J_x[j] + + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; } - }) - .wait_and_throw(); + } + } + }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); information = core::Tensor::Empty({6, 6}, core::Float64, host); diff --git a/cpp/open3d/t/pipelines/kernel/Registration.cpp b/cpp/open3d/t/pipelines/kernel/Registration.cpp index b3ebf4f5508..95ccddec5c2 100644 --- a/cpp/open3d/t/pipelines/kernel/Registration.cpp +++ b/cpp/open3d/t/pipelines/kernel/Registration.cpp @@ -378,10 +378,9 @@ core::Tensor ComputeInformationMatrix( target_points.GetDtype(), device); } else if (target_points.IsSYCL()) { #ifdef BUILD_SYCL_MODULE - 
ComputeInformationMatrixSYCL(target_points.Contiguous(), - correspondence_indices.Contiguous(), - information_matrix, - target_points.GetDtype(), device); + ComputeInformationMatrixSYCL( + target_points.Contiguous(), correspondence_indices.Contiguous(), + information_matrix, target_points.GetDtype(), device); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index f3b65414f88..94acdf773b3 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -151,30 +151,30 @@ void ComputeInformationMatrixCUDA(const core::Tensor &target_points, #ifdef BUILD_SYCL_MODULE void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel); + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel); void ComputePoseColoredICPSYCL(const core::Tensor &source_points, - const core::Tensor &source_colors, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &target_colors, - const core::Tensor &target_color_gradients, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel, - const double &lambda_geometric); + const core::Tensor &source_colors, + const core::Tensor &target_points, + const 
core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric); void ComputePoseDopplerICPSYCL( const core::Tensor &source_points, @@ -200,10 +200,10 @@ void ComputePoseDopplerICPSYCL( const double lambda_doppler); void ComputeInformationMatrixSYCL(const core::Tensor &target_points, - const core::Tensor &correspondence_indices, - core::Tensor &information_matrix, - const core::Dtype &dtype, - const core::Device &device); + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device); #endif template diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp index 17bf9768f93..b44c696e7e4 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -18,19 +18,19 @@ namespace t { namespace pipelines { namespace kernel { -static constexpr int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) - -void ComputePosePointToPlaneSYCL( - const core::Tensor &source_points, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel) { +static constexpr int kReduceDim = + 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) + +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int 
&inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel) { const int n = source_points.GetLength(); core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); @@ -113,21 +113,20 @@ void ComputePosePointToPlaneSYCL( DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); } -void ComputePoseColoredICPSYCL( - const core::Tensor &source_points, - const core::Tensor &source_colors, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &target_colors, - const core::Tensor &target_color_gradients, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel, - const double &lambda_geometric) { +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric) { const int n = source_points.GetLength(); core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); @@ -197,8 +196,7 @@ void ComputePoseColoredICPSYCL( sycl::memory_scope:: device>( global_sum_ptr[i]) += - J_G[j] * w_G * - J_G[k] + + J_G[j] * w_G * J_G[k] + J_I[j] * w_I * J_I[k]; ++i; } @@ -280,8 +278,8 @@ void ComputePoseDopplerICPSYCL( scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); queue.single_task([=]() { PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, - w_v_in_V_ptr, v_v_in_V_ptr, - v_s_in_S_ptr); + w_v_in_V_ptr, v_v_in_V_ptr, + v_s_in_S_ptr); }).wait_and_throw(); } @@ -317,25 +315,23 
@@ void ComputePoseDopplerICPSYCL( scalar_t J_G[6] = {0}, J_D[6] = {0}; scalar_t r_G = 0, r_D = 0; - const bool valid = - GetJacobianDopplerICP( - workload_idx, - source_points_ptr, - source_dopplers_ptr, - source_directions_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - R_S_to_V_ptr, - r_v_to_s_in_V_ptr, - v_s_in_S_ptr, - reject_dynamic_outliers, - static_cast( - doppler_outlier_threshold), - sqrt_lambda_geometric, - sqrt_lambda_doppler, - sqrt_lambda_doppler_by_dt, - J_G, J_D, r_G, r_D); + const bool valid = GetJacobianDopplerICP< + scalar_t>( + workload_idx, source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, J_G, + J_D, r_G, r_D); if (valid) { const scalar_t w_G = @@ -355,8 +351,7 @@ void ComputePoseDopplerICPSYCL( sycl::memory_scope:: device>( global_sum_ptr[i]) += - J_G[j] * w_G * - J_G[k] + + J_G[j] * w_G * J_G[k] + J_D[j] * w_D * J_D[k]; ++i; } @@ -391,12 +386,11 @@ void ComputePoseDopplerICPSYCL( DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); } -void ComputeInformationMatrixSYCL( - const core::Tensor &target_points, - const core::Tensor &correspondence_indices, - core::Tensor &information_matrix, - const core::Dtype &dtype, - const core::Device &device) { +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device) { const int n = correspondence_indices.GetLength(); core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); @@ -411,33 +405,28 @@ void ComputeInformationMatrixSYCL( const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - 
queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - const bool valid = GetInformationJacobians( - workload_idx, target_points_ptr, - correspondence_indices_ptr, J_x, J_y, J_z); - - if (valid) { - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[i]) += - J_x[j] * J_x[k] + - J_y[j] * J_y[k] + - J_z[j] * J_z[k]; - ++i; - } - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + const bool valid = GetInformationJacobians( + workload_idx, target_points_ptr, + correspondence_indices_ptr, J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref( + global_sum_ptr[i]) += J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + ++i; } - }) - .wait_and_throw(); + } + } + }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); core::Tensor global_sum_cpu = global_sum.To(host, core::Float64); diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h index 02cc741d62e..6ee52ee8ab5 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h @@ -84,8 +84,8 @@ void TransformationToPoseCUDA(scalar_t *pose_ptr, /// the parent function PoseToTransformation. template void PoseToTransformationSYCL(scalar_t *transformation_ptr, - const scalar_t *pose_ptr, - const core::Device &device); + const scalar_t *pose_ptr, + const core::Device &device); /// \brief Helper function for TransformationToPoseSYCL. 
/// Do not call this independently, as it only sets the rotation part in the @@ -93,8 +93,8 @@ void PoseToTransformationSYCL(scalar_t *transformation_ptr, /// TransformationToPose. template void TransformationToPoseSYCL(scalar_t *pose_ptr, - const scalar_t *transformation_ptr, - const core::Device &device); + const scalar_t *transformation_ptr, + const core::Device &device); #endif } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp index 1ec3e943a3c..13eb4490465 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp @@ -17,8 +17,8 @@ namespace kernel { template void PoseToTransformationSYCL(scalar_t *transformation_ptr, - const scalar_t *pose_ptr, - const core::Device &device) { + const scalar_t *pose_ptr, + const core::Device &device) { sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); queue.single_task([=]() { @@ -28,8 +28,8 @@ void PoseToTransformationSYCL(scalar_t *transformation_ptr, template void TransformationToPoseSYCL(scalar_t *pose_ptr, - const scalar_t *transformation_ptr, - const core::Device &device) { + const scalar_t *transformation_ptr, + const core::Device &device) { sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); queue.single_task([=]() { From 5c3b28c3cf3f2b76da8705105d0bffad7a01902c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:41:01 +0000 Subject: [PATCH 06/14] Update t/pipelines tests to use PermuteDevicesWithSYCL for SYCL device coverage Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/tests/t/pipelines/TransformationConverter.cpp | 4 ++-- cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp | 4 ++-- cpp/tests/t/pipelines/registration/Feature.cpp | 4 ++-- 
cpp/tests/t/pipelines/registration/Registration.cpp | 4 ++-- .../t/pipelines/registration/TransformationEstimation.cpp | 4 ++-- cpp/tests/t/pipelines/slac/ControlGrid.cpp | 4 ++-- cpp/tests/t/pipelines/slac/SLAC.cpp | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/tests/t/pipelines/TransformationConverter.cpp b/cpp/tests/t/pipelines/TransformationConverter.cpp index 5d8bd0d8434..8036b55a08f 100644 --- a/cpp/tests/t/pipelines/TransformationConverter.cpp +++ b/cpp/tests/t/pipelines/TransformationConverter.cpp @@ -14,10 +14,10 @@ namespace open3d { namespace tests { -class TransformationConverterPermuteDevices : public PermuteDevices {}; +class TransformationConverterPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(TransformationConverter, TransformationConverterPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TransformationConverterPermuteDevices, RtToTransformation) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp index 3bb7ee36142..7d1b8b6466b 100644 --- a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp +++ b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp @@ -21,10 +21,10 @@ namespace open3d { namespace tests { -class OdometryPermuteDevices : public PermuteDevices {}; +class OdometryPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Odometry, OdometryPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); core::Tensor CreateIntrisicTensor() { camera::PinholeCameraIntrinsic intrinsic = camera::PinholeCameraIntrinsic( diff --git a/cpp/tests/t/pipelines/registration/Feature.cpp b/cpp/tests/t/pipelines/registration/Feature.cpp index a3741a3ed8a..1266ab7f705 100644 --- a/cpp/tests/t/pipelines/registration/Feature.cpp +++ 
b/cpp/tests/t/pipelines/registration/Feature.cpp @@ -20,10 +20,10 @@ namespace open3d { namespace tests { -class FeaturePermuteDevices : public PermuteDevices {}; +class FeaturePermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Feature, FeaturePermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(FeaturePermuteDevices, SelectByIndex) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/registration/Registration.cpp b/cpp/tests/t/pipelines/registration/Registration.cpp index d823394c6bf..76b31da3dd9 100644 --- a/cpp/tests/t/pipelines/registration/Registration.cpp +++ b/cpp/tests/t/pipelines/registration/Registration.cpp @@ -26,10 +26,10 @@ namespace l_reg = open3d::pipelines::registration; namespace open3d { namespace tests { -class RegistrationPermuteDevices : public PermuteDevices {}; +class RegistrationPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Registration, RegistrationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(RegistrationPermuteDevices, ICPConvergenceCriteriaConstructor) { // Constructor. 
diff --git a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp index 2daca846d0a..c98ab268529 100644 --- a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp +++ b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp @@ -15,10 +15,10 @@ namespace open3d { namespace tests { -class TransformationEstimationPermuteDevices : public PermuteDevices {}; +class TransformationEstimationPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(TransformationEstimation, TransformationEstimationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); static std:: tuple diff --git a/cpp/tests/t/pipelines/slac/ControlGrid.cpp b/cpp/tests/t/pipelines/slac/ControlGrid.cpp index 98bce9d5ef2..5e8e5d59949 100644 --- a/cpp/tests/t/pipelines/slac/ControlGrid.cpp +++ b/cpp/tests/t/pipelines/slac/ControlGrid.cpp @@ -24,10 +24,10 @@ static t::geometry::PointCloud CreateTPCDFromFile( return t::geometry::PointCloud::FromLegacy(*pcd, core::Float32, device); } -class ControlGridPermuteDevices : public PermuteDevices {}; +class ControlGridPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(ControlGrid, ControlGridPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // TODO(wei): more well-designed test cases TEST_P(ControlGridPermuteDevices, Touch) { diff --git a/cpp/tests/t/pipelines/slac/SLAC.cpp b/cpp/tests/t/pipelines/slac/SLAC.cpp index bc31cb31044..73aef1ad212 100644 --- a/cpp/tests/t/pipelines/slac/SLAC.cpp +++ b/cpp/tests/t/pipelines/slac/SLAC.cpp @@ -27,10 +27,10 @@ namespace open3d { namespace tests { -class SLACPermuteDevices : public PermuteDevices {}; +class SLACPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(SLAC, SLACPermuteDevices, - 
testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // PointCloud is similar if fitness is higher and rmse is lower than tolerance // threshold. From 4cbdf15a900b822f4c5aeb0f7fa435e43218ddfb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:57:40 +0000 Subject: [PATCH 07/14] Optimize SYCL pipeline kernels: use nd_range + reduce_over_group instead of plain global atomics Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- .../kernel/FillInLinearSystemSYCL.cpp | 144 ++++-- .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 463 +++++++++-------- .../t/pipelines/kernel/RegistrationSYCL.cpp | 466 ++++++++++-------- 3 files changed, 616 insertions(+), 457 deletions(-) diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp index 2f16deec4e8..c1c524005a7 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -36,7 +36,12 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, "Unable to setup linear system: input length mismatch."); } - // First fill in a small 12 x 12 linear system. + // First fill in a small 12 x 12 linear system using group reduction. + // kLocalDim = 12*12 + 12 + 1 = 157 elements. 
+ static constexpr int kLocalDim12 = 144; // 12*12 + static constexpr int kLocalDimAtb = 12; + static constexpr int kLocalDimTotal = kLocalDim12 + kLocalDimAtb + 1; + core::Tensor AtA_local = core::Tensor::Zeros({12, 12}, core::Float32, device); core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device); @@ -50,47 +55,102 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, const float *Ri_normal_ps_ptr = static_cast(Ri_normal_ps.GetDataPtr()); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - const float *p_prime = Ti_ps_ptr + 3 * workload_idx; - const float *q_prime = Tj_qs_ptr + 3 * workload_idx; - const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx; - - float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] + - (p_prime[1] - q_prime[1]) * normal_p_prime[1] + - (p_prime[2] - q_prime[2]) * normal_p_prime[2]; - if (sycl::fabs(r) > threshold) return; - - float J_ij[12]; - J_ij[0] = -q_prime[2] * normal_p_prime[1] + - q_prime[1] * normal_p_prime[2]; - J_ij[1] = q_prime[2] * normal_p_prime[0] - - q_prime[0] * normal_p_prime[2]; - J_ij[2] = -q_prime[1] * normal_p_prime[0] + - q_prime[0] * normal_p_prime[1]; - J_ij[3] = normal_p_prime[0]; - J_ij[4] = normal_p_prime[1]; - J_ij[5] = normal_p_prime[2]; - for (int k = 0; k < 6; ++k) { - J_ij[k + 6] = -J_ij[k]; - } - - for (int i_local = 0; i_local < 12; ++i_local) { - for (int j_local = 0; j_local < 12; ++j_local) { - sycl::atomic_ref( - AtA_local_ptr[i_local * 12 + j_local]) += - J_ij[i_local] * J_ij[j_local]; - } - sycl::atomic_ref( - Atb_local_ptr[i_local]) += J_ij[i_local] * r; - } - sycl::atomic_ref(*residual_ptr) += - r * r; + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n 
+ wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + // Private accumulation buffer: [AtA_local(144) | + // Atb_local(12) | residual(1)] + float local_sum[kLocalDimTotal] = {}; + + if (gid < n) { + const float *p_prime = Ti_ps_ptr + 3 * gid; + const float *q_prime = Tj_qs_ptr + 3 * gid; + const float *normal_p_prime = + Ri_normal_ps_ptr + 3 * gid; + + float r = + (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * + normal_p_prime[2]; + + if (sycl::fabs(r) <= threshold) { + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; + ++i_local) { + for (int j_local = 0; j_local < 12; + ++j_local) { + local_sum[i_local * 12 + j_local] += + J_ij[i_local] * J_ij[j_local]; + } + local_sum[kLocalDim12 + i_local] += + J_ij[i_local] * r; + } + local_sum[kLocalDim12 + kLocalDimAtb] += + r * r; + } + } + + // Group-reduce each element and atomically add to + // the device-side local system buffers. 
+ auto grp = item.get_group(); + for (int k = 0; k < kLocalDim12; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[k], sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_local_ptr[k]) += gv; + } + } + for (int k = 0; k < kLocalDimAtb; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[kLocalDim12 + k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + Atb_local_ptr[k]) += gv; + } + } + { + float gv = sycl::reduce_over_group( + grp, + local_sum[kLocalDim12 + kLocalDimAtb], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + *residual_ptr) += gv; + } + } + }); }).wait_and_throw(); // Then fill-in the large linear system. diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp index d49ed13b4b1..59181b6d54c 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -25,15 +25,34 @@ using t::geometry::kernel::TransformIndexer; static constexpr int kReduceDimOdometry = 29; static constexpr int kJtJDimOdometry = 21; +/// Perform a group reduction over a fixed-size private float array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template +static inline void GroupReduceAndAdd(sycl::nd_item<1> item, + const float (&local_sum)[N], + float *global_sum_ptr) { + auto grp = item.get_group(); + for (int k = 0; k < N; ++k) { + float grp_val = sycl::reduce_over_group(grp, local_sum[k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref(global_sum_ptr[k]) += + grp_val; + } + } +} + void ComputeOdometryResultPointToPlaneSYCL( - const core::Tensor& source_vertex_map, - const core::Tensor& target_vertex_map, - const core::Tensor& target_normal_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, + const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, + const core::Tensor &target_normal_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, const float depth_outlier_trunc, const float depth_huber_delta) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); @@ -50,63 +69,71 @@ void ComputeOdometryResultPointToPlaneSYCL( core::Tensor global_sum = core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J[6] = {0}; - float r = 0; - const bool valid = GetJacobianPointToPlane( - x, y, depth_outlier_trunc, source_vertex_indexer, - target_vertex_indexer, target_normal_indexer, ti, J, r); - - if (valid) { - const float d_huber = HuberDeriv(r, depth_huber_delta); - const float r_huber = HuberLoss(r, depth_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - 
global_sum_ptr[offset++]) += J[i] * J[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref( - global_sum_ptr[21 + i]) += J[i] * d_huber; - } - sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J[6] = {0}; + float r = 0; + const bool valid = GetJacobianPointToPlane( + x, y, depth_outlier_trunc, + source_vertex_indexer, + target_vertex_indexer, + target_normal_indexer, ti, J, r); + + if (valid) { + const float d_huber = + HuberDeriv(r, depth_huber_delta); + const float r_huber = + HuberLoss(r, depth_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += J[i] * J[j]; + } + local_sum[21 + i] += J[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd( + item, local_sum, global_sum_ptr); + }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } void ComputeOdometryResultIntensitySYCL( - const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - 
float& inlier_residual, - int& inlier_count, + const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, const float depth_outlier_trunc, const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); @@ -127,70 +154,80 @@ void ComputeOdometryResultIntensitySYCL( core::Tensor global_sum = core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}; - float r_I = 0; - const bool valid = GetJacobianIntensity( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, ti, - J_I, r_I); - - if (valid) { - const float d_huber = HuberDeriv(r_I, intensity_huber_delta); - const float r_huber = HuberLoss(r_I, intensity_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - global_sum_ptr[offset++]) += J_I[i] * J_I[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref( - global_sum_ptr[21 + i]) += J_I[i] * d_huber; - } - sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + 
core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, r_I); + + if (valid) { + const float d_huber = + HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = + HuberLoss(r_I, intensity_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_I[i] * J_I[j]; + } + local_sum[21 + i] += J_I[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd( + item, local_sum, global_sum_ptr); + }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_depth_dx, - const core::Tensor& target_depth_dy, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const 
float intensity_huber_delta) { +void ComputeOdometryResultHybridSYCL( + const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_depth_dx, + const core::Tensor &target_depth_dy, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); NDArrayIndexer target_depth_indexer(target_depth, 2); NDArrayIndexer source_intensity_indexer(source_intensity, 2); @@ -211,62 +248,76 @@ void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, core::Tensor global_sum = core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}, J_D[6] = {0}; - float r_I = 0, r_D = 0; - const bool valid = GetJacobianHybrid( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, target_depth_dx_indexer, - target_depth_dy_indexer, target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, ti, - J_I, J_D, r_I, r_D); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - global_sum_ptr[offset++]) += - J_I[i] * J_I[j] + J_D[i] * J_D[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref( - 
global_sum_ptr[21 + i]) += - J_I[i] * HuberDeriv(r_I, intensity_huber_delta) + - J_D[i] * HuberDeriv(r_D, depth_huber_delta); - } - sycl::atomic_ref( - global_sum_ptr[27]) += - HuberLoss(r_I, intensity_huber_delta) + - HuberLoss(r_D, depth_huber_delta); - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_depth_dx_indexer, + target_depth_dy_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, J_D, r_I, + r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_I[i] * J_I[j] + + J_D[i] * J_D[j]; + } + local_sum[21 + i] += + J_I[i] * HuberDeriv( + r_I, + intensity_huber_delta) + + J_D[i] * HuberDeriv(r_D, + depth_huber_delta); + } + local_sum[27] += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd( + item, local_sum, global_sum_ptr); + }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_vertex_map, - const core::Tensor& 
target_vertex_map, - const core::Tensor& intrinsic, - const core::Tensor& source_to_target, - const float square_dist_thr, - core::Tensor& information) { +void ComputeOdometryInformationMatrixSYCL( + const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, + const core::Tensor &intrinsic, + const core::Tensor &source_to_target, + const float square_dist_thr, + core::Tensor &information) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); @@ -280,41 +331,57 @@ void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_vertex_map, core::Tensor global_sum = core::Tensor::Zeros({kJtJDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - float rx = 0, ry = 0, rz = 0; - const bool valid = GetJacobianPointToPoint( - x, y, square_dist_thr, source_vertex_indexer, - target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - global_sum_ptr[offset++]) += J_x[i] * J_x[j] + - J_y[i] * J_y[j] + - J_z[i] * J_z[j]; - } - } - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float 
local_sum[kJtJDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, + source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, + rx, ry, rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_x[i] * J_x[j] + + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; + } + } + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); information = core::Tensor::Empty({6, 6}, core::Float64, host); global_sum = global_sum.To(host, core::Float64); - double* info_ptr = information.GetDataPtr(); - double* reduction_ptr = global_sum.GetDataPtr(); + double *info_ptr = information.GetDataPtr(); + double *reduction_ptr = global_sum.GetDataPtr(); for (int j = 0; j < 6; j++) { const int64_t reduction_idx = ((j * (j + 1)) / 2); for (int k = 0; k <= j; k++) { diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp index b44c696e7e4..80f4b2a4f73 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -21,6 +21,25 @@ namespace kernel { static constexpr int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) +/// Perform a group reduction over a fixed-size private array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template +static inline void GroupReduceAndAdd(sycl::nd_item<1> item, + const scalar_t (&local_sum)[N], + scalar_t *global_sum_ptr) { + auto grp = item.get_group(); + for (int k = 0; k < N; ++k) { + scalar_t grp_val = sycl::reduce_over_group(grp, local_sum[k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref(global_sum_ptr[k]) += + grp_val; + } + } +} + void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, const core::Tensor &target_points, const core::Tensor &target_normals, @@ -35,8 +54,11 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { scalar_t *global_sum_ptr = global_sum.GetDataPtr(); @@ -53,60 +75,54 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_ij[6] = {0}; - scalar_t r = 0; - const bool valid = - GetJacobianPointToPlane( - workload_idx, - source_points_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - J_ij, r); - - if (valid) { - const scalar_t w = - GetWeightFromRobustKernel(r); - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[i]) += - J_ij[j] * w * J_ij[k]; - ++i; + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups 
* wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = + item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_ij[6] = {0}; + scalar_t r = 0; + const bool valid = + GetJacobianPointToPlane< + scalar_t>( + gid, + source_points_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); + + if (valid) { + const scalar_t w = + GetWeightFromRobustKernel( + r); + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_ij[j] * w * + J_ij[k]; + } + local_sum[21 + j] += + J_ij[j] * w * r; + } + local_sum[27] += r; + local_sum[28] += + scalar_t(1); } - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[21 + j]) += - J_ij[j] * w * r; } - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[27]) += r; - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[28]) += - scalar_t(1); - } - }) - .wait_and_throw(); + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); }); }); @@ -131,8 +147,11 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { const scalar_t sqrt_lambda_geometric = @@ -159,72 +178,70 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - queue.parallel_for( - 
sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_G[6] = {0}, J_I[6] = {0}; - scalar_t r_G = 0, r_I = 0; - - const bool valid = - GetJacobianColoredICP( - workload_idx, - source_points_ptr, - source_colors_ptr, - target_points_ptr, - target_normals_ptr, - target_colors_ptr, - target_color_gradients_ptr, - correspondence_indices_ptr, - sqrt_lambda_geometric, - sqrt_lambda_photometric, - J_G, J_I, r_G, r_I); - - if (valid) { - const scalar_t w_G = - GetWeightFromRobustKernel(r_G); - const scalar_t w_I = - GetWeightFromRobustKernel(r_I); - - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[i]) += - J_G[j] * w_G * J_G[k] + - J_I[j] * w_I * J_I[k]; - ++i; + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = + item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_G[6] = {0}, + J_I[6] = {0}; + scalar_t r_G = 0, r_I = 0; + + const bool valid = + GetJacobianColoredICP< + scalar_t>( + gid, + source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, + r_I); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernel( + r_G); + const scalar_t w_I = + GetWeightFromRobustKernel( + r_I); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_G[j] * w_G * + J_G[k] + + J_I[j] * w_I * + J_I[k]; + } + local_sum[21 + j] += + J_G[j] * w_G * + r_G + + J_I[j] * w_I * r_I; + } + local_sum[27] += + r_G * r_G + r_I * r_I; + local_sum[28] += scalar_t(1); } - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - 
sycl::memory_scope:: - device>( - global_sum_ptr[21 + j]) += - J_G[j] * w_G * r_G + - J_I[j] * w_I * r_I; } - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[27]) += - r_G * r_G + r_I * r_I; - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[28]) += - scalar_t(1); - } - }) - .wait_and_throw(); + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); }); }); @@ -258,8 +275,11 @@ void ComputePoseDopplerICPSYCL( core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { const scalar_t sqrt_lambda_geometric = @@ -278,8 +298,8 @@ void ComputePoseDopplerICPSYCL( scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); queue.single_task([=]() { PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, - w_v_in_V_ptr, v_v_in_V_ptr, - v_s_in_S_ptr); + w_v_in_V_ptr, v_v_in_V_ptr, + v_s_in_S_ptr); }).wait_and_throw(); } @@ -308,78 +328,76 @@ void ComputePoseDopplerICPSYCL( const scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_G[6] = {0}, J_D[6] = {0}; - scalar_t r_G = 0, r_D = 0; - - const bool valid = GetJacobianDopplerICP< - scalar_t>( - workload_idx, source_points_ptr, - source_dopplers_ptr, - source_directions_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - R_S_to_V_ptr, r_v_to_s_in_V_ptr, - v_s_in_S_ptr, - 
reject_dynamic_outliers, - static_cast( - doppler_outlier_threshold), - sqrt_lambda_geometric, - sqrt_lambda_doppler, - sqrt_lambda_doppler_by_dt, J_G, - J_D, r_G, r_D); - - if (valid) { - const scalar_t w_G = - GetWeightFromRobustKernelFirst( - r_G); - const scalar_t w_D = - GetWeightFromRobustKernelSecond( - r_D); - - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[i]) += - J_G[j] * w_G * J_G[k] + - J_D[j] * w_D * J_D[k]; - ++i; + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = + item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_G[6] = {0}, + J_D[6] = {0}; + scalar_t r_G = 0, r_D = 0; + + const bool valid = + GetJacobianDopplerICP< + scalar_t>( + gid, + source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, J_D, r_G, + r_D); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernelFirst( + r_G); + const scalar_t w_D = + GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_G[j] * w_G * + J_G[k] + + J_D[j] * w_D * + J_D[k]; + } + local_sum[21 + j] += + J_G[j] * w_G * + r_G + + J_D[j] * w_D * r_D; + } + local_sum[27] += + r_G * r_G + r_D * r_D; + local_sum[28] += scalar_t(1); } - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[21 + j]) += - J_G[j] * w_G * r_G + - J_D[j] * w_D * r_D; } - sycl::atomic_ref< - scalar_t, - 
sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[27]) += - r_G * r_G + r_D * r_D; - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[28]) += - scalar_t(1); - } - }) - .wait_and_throw(); + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); }); }); @@ -395,8 +413,11 @@ void ComputeInformationMatrixSYCL(const core::Tensor &target_points, core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { scalar_t *global_sum_ptr = global_sum.GetDataPtr(); @@ -405,27 +426,38 @@ void ComputeInformationMatrixSYCL(const core::Tensor &target_points, const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - const bool valid = GetInformationJacobians( - workload_idx, target_points_ptr, - correspondence_indices_ptr, J_x, J_y, J_z); - - if (valid) { - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref( - global_sum_ptr[i]) += J_x[j] * J_x[k] + - J_y[j] * J_y[k] + - J_z[j] * J_z[k]; - ++i; - } - } - } + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[21] = {}; + + if (gid < n) { + scalar_t J_x[6] = {0}, J_y[6] = {0}, + J_z[6] = {0}; + const bool valid = + GetInformationJacobians( + gid, target_points_ptr, + 
correspondence_indices_ptr, + J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + local_sum[i++] += + J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + } + } + } + } + + GroupReduceAndAdd<21, scalar_t>(item, local_sum, + global_sum_ptr); + }); }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); From 27ccd459dd2f487bd202bd3ba6eec58cc89cf09b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 13 Mar 2026 18:13:32 +0000 Subject: [PATCH 08/14] Apply automatic code formatting --- cpp/open3d/data/Dataset.h | 20 +- cpp/open3d/geometry/PointCloud.h | 3 +- .../GlobalOptimizationConvergenceCriteria.h | 6 +- .../pipelines/registration/Registration.h | 6 +- .../registration/TransformationEstimation.h | 3 +- .../kernel/FillInLinearSystemSYCL.cpp | 37 +- .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 80 ++-- .../t/pipelines/kernel/RegistrationSYCL.cpp | 371 +++++++++++------- .../t/pipelines/TransformationConverter.cpp | 7 +- .../t/pipelines/odometry/RGBDOdometry.cpp | 7 +- .../t/pipelines/registration/Feature.cpp | 7 +- .../t/pipelines/registration/Registration.cpp | 7 +- .../registration/TransformationEstimation.cpp | 7 +- cpp/tests/t/pipelines/slac/ControlGrid.cpp | 7 +- cpp/tests/t/pipelines/slac/SLAC.cpp | 7 +- 15 files changed, 326 insertions(+), 249 deletions(-) diff --git a/cpp/open3d/data/Dataset.h b/cpp/open3d/data/Dataset.h index 27df77dfb14..e66a5370a2a 100644 --- a/cpp/open3d/data/Dataset.h +++ b/cpp/open3d/data/Dataset.h @@ -754,9 +754,9 @@ class PaintedPlasterTexture : public DownloadDataset { }; /// \class RedwoodIndoorLivingRoom1 -/// \brief Data class for `RedwoodIndoorLivingRoom1` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. 
+/// \brief Data class for `RedwoodIndoorLivingRoom1` (Augmented ICL-NUIM +/// Dataset), containing dense point cloud, rgb sequence, clean depth sequence, +/// noisy depth sequence, oni sequence, and ground-truth camera trajectory. /// /// RedwoodIndoorLivingRoom1 /// ├── colors @@ -810,9 +810,9 @@ class RedwoodIndoorLivingRoom1 : public DownloadDataset { }; /// \class RedwoodIndoorLivingRoom2 -/// \brief Data class for `RedwoodIndoorLivingRoom2` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. +/// \brief Data class for `RedwoodIndoorLivingRoom2` (Augmented ICL-NUIM +/// Dataset), containing dense point cloud, rgb sequence, clean depth sequence, +/// noisy depth sequence, oni sequence, and ground-truth camera trajectory. /// /// RedwoodIndoorLivingRoom2 /// ├── colors @@ -867,8 +867,8 @@ class RedwoodIndoorLivingRoom2 : public DownloadDataset { /// \class RedwoodIndoorOffice1 /// \brief Data class for `RedwoodIndoorOffice1` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. +/// containing dense point cloud, rgb sequence, clean depth sequence, noisy +/// depth sequence, oni sequence, and ground-truth camera trajectory. /// /// RedwoodIndoorOffice1 /// ├── colors @@ -923,8 +923,8 @@ class RedwoodIndoorOffice1 : public DownloadDataset { /// \class RedwoodIndoorOffice2 /// \brief Data class for `RedwoodIndoorOffice2` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. +/// containing dense point cloud, rgb sequence, clean depth sequence, noisy +/// depth sequence, oni sequence, and ground-truth camera trajectory. 
/// /// RedwoodIndoorOffice2 /// ├── colors diff --git a/cpp/open3d/geometry/PointCloud.h b/cpp/open3d/geometry/PointCloud.h index 66e26676d18..c528095637d 100644 --- a/cpp/open3d/geometry/PointCloud.h +++ b/cpp/open3d/geometry/PointCloud.h @@ -281,7 +281,8 @@ class PointCloud : public Geometry3D { /// /// /// \param input PointCloud to use for covariance computation. - /// \param search_param The KDTree search parameters for neighborhood search. + /// \param search_param The KDTree search parameters for neighborhood + /// search. static std::vector EstimatePerPointCovariances( const PointCloud &input, const KDTreeSearchParam &search_param = KDTreeSearchParamKNN()); diff --git a/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h b/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h index 892c92ee8ce..ba4d283df58 100644 --- a/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h +++ b/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h @@ -81,9 +81,9 @@ class GlobalOptimizationConvergenceCriteria { /// increments. /// \param min_right_term Minimum right term value. /// \param min_residual Minimum residual value. - /// \param max_iteration_lm Maximum iteration number for Levenberg Marquardt method. - /// \param upper_scale_factor Upper scale factor value. - /// \param lower_scale_factor Lower scale factor value. + /// \param max_iteration_lm Maximum iteration number for Levenberg Marquardt + /// method. \param upper_scale_factor Upper scale factor value. \param + /// lower_scale_factor Lower scale factor value. 
GlobalOptimizationConvergenceCriteria( int max_iteration = 100, double min_relative_increment = 1e-6, diff --git a/cpp/open3d/pipelines/registration/Registration.h b/cpp/open3d/pipelines/registration/Registration.h index 427e1a78cb9..c2c2f91108b 100644 --- a/cpp/open3d/pipelines/registration/Registration.h +++ b/cpp/open3d/pipelines/registration/Registration.h @@ -39,9 +39,9 @@ class ICPConvergenceCriteria { /// /// \param relative_fitness If relative change (difference) of fitness score /// is lower than relative_fitness, the iteration stops. - /// \param relative_rmse If relative change (difference) of inliner RMSE score is - /// lower than relative_rmse, the iteration stops. - /// \param max_iteration Maximum iteration before iteration stops. + /// \param relative_rmse If relative change (difference) of inliner RMSE + /// score is lower than relative_rmse, the iteration stops. \param + /// max_iteration Maximum iteration before iteration stops. ICPConvergenceCriteria(double relative_fitness = 1e-6, double relative_rmse = 1e-6, int max_iteration = 30) diff --git a/cpp/open3d/pipelines/registration/TransformationEstimation.h b/cpp/open3d/pipelines/registration/TransformationEstimation.h index a953f40ef90..40b3229f6ca 100644 --- a/cpp/open3d/pipelines/registration/TransformationEstimation.h +++ b/cpp/open3d/pipelines/registration/TransformationEstimation.h @@ -139,7 +139,8 @@ class TransformationEstimationPointToPlane : public TransformationEstimation { ~TransformationEstimationPointToPlane() override {} /// \brief Constructor that takes as input a RobustKernel. - /// \param kernel Any of the implemented statistical robust kernel for outlier rejection. + /// \param kernel Any of the implemented statistical robust kernel for + /// outlier rejection. 
explicit TransformationEstimationPointToPlane( std::shared_ptr kernel) : kernel_(std::move(kernel)) {} diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp index c1c524005a7..a43767ba40e 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -76,13 +76,12 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, const float *normal_p_prime = Ri_normal_ps_ptr + 3 * gid; - float r = - (p_prime[0] - q_prime[0]) * - normal_p_prime[0] + - (p_prime[1] - q_prime[1]) * - normal_p_prime[1] + - (p_prime[2] - q_prime[2]) * - normal_p_prime[2]; + float r = (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * + normal_p_prime[2]; if (sycl::fabs(r) <= threshold) { float J_ij[12]; @@ -109,8 +108,7 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, local_sum[kLocalDim12 + i_local] += J_ij[i_local] * r; } - local_sum[kLocalDim12 + kLocalDimAtb] += - r * r; + local_sum[kLocalDim12 + kLocalDimAtb] += r * r; } } @@ -121,9 +119,9 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, float gv = sycl::reduce_over_group( grp, local_sum[k], sycl::plus{}); if (item.get_local_id(0) == 0) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( + sycl::atomic_ref( AtA_local_ptr[k]) += gv; } } @@ -132,21 +130,20 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, grp, local_sum[kLocalDim12 + k], sycl::plus{}); if (item.get_local_id(0) == 0) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( + sycl::atomic_ref( Atb_local_ptr[k]) += gv; } } { float gv = sycl::reduce_over_group( - grp, - local_sum[kLocalDim12 + kLocalDimAtb], + grp, local_sum[kLocalDim12 + kLocalDimAtb], sycl::plus{}); if (item.get_local_id(0) == 0) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - 
sycl::memory_scope::device>( + sycl::atomic_ref( *residual_ptr) += gv; } } diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp index 59181b6d54c..c27b3159609 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -34,8 +34,8 @@ static inline void GroupReduceAndAdd(sycl::nd_item<1> item, float *global_sum_ptr) { auto grp = item.get_group(); for (int k = 0; k < N; ++k) { - float grp_val = sycl::reduce_over_group(grp, local_sum[k], - sycl::plus{}); + float grp_val = + sycl::reduce_over_group(grp, local_sum[k], sycl::plus{}); if (item.get_local_id(0) == 0) { sycl::atomic_ref(global_sum_ptr[k]) += @@ -113,8 +113,8 @@ void ComputeOdometryResultPointToPlaneSYCL( } } - GroupReduceAndAdd( - item, local_sum, global_sum_ptr); + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); }); }).wait_and_throw(); @@ -192,8 +192,7 @@ void ComputeOdometryResultIntensitySYCL( int offset = 0; for (int i = 0; i < 6; ++i) { for (int j = 0; j <= i; ++j) { - local_sum[offset++] += - J_I[i] * J_I[j]; + local_sum[offset++] += J_I[i] * J_I[j]; } local_sum[21 + i] += J_I[i] * d_huber; } @@ -202,32 +201,31 @@ void ComputeOdometryResultIntensitySYCL( } } - GroupReduceAndAdd( - item, local_sum, global_sum_ptr); + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryResultHybridSYCL( - const core::Tensor &source_depth, - const core::Tensor &target_depth, - const core::Tensor &source_intensity, - const core::Tensor &target_intensity, - const core::Tensor &target_depth_dx, - const core::Tensor &target_depth_dy, - const core::Tensor &target_intensity_dx, - const core::Tensor &target_intensity_dy, - const core::Tensor &source_vertex_map, - const core::Tensor &intrinsics, - const core::Tensor &init_source_to_target, - core::Tensor &delta, - float 
&inlier_residual, - int &inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const float intensity_huber_delta) { +void ComputeOdometryResultHybridSYCL(const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_depth_dx, + const core::Tensor &target_depth_dy, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); NDArrayIndexer target_depth_indexer(target_depth, 2); NDArrayIndexer source_intensity_indexer(source_intensity, 2); @@ -290,11 +288,14 @@ void ComputeOdometryResultHybridSYCL( J_D[i] * J_D[j]; } local_sum[21 + i] += - J_I[i] * HuberDeriv( - r_I, - intensity_huber_delta) + - J_D[i] * HuberDeriv(r_D, - depth_huber_delta); + J_I[i] * + HuberDeriv( + r_I, + intensity_huber_delta) + + J_D[i] * + HuberDeriv( + r_D, + depth_huber_delta); } local_sum[27] += HuberLoss(r_I, intensity_huber_delta) + @@ -303,21 +304,20 @@ void ComputeOdometryResultHybridSYCL( } } - GroupReduceAndAdd( - item, local_sum, global_sum_ptr); + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryInformationMatrixSYCL( - const core::Tensor &source_vertex_map, - const core::Tensor &target_vertex_map, - const core::Tensor &intrinsic, - const core::Tensor &source_to_target, - const float square_dist_thr, - core::Tensor &information) { +void ComputeOdometryInformationMatrixSYCL(const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, 
+ const core::Tensor &intrinsic, + const core::Tensor &source_to_target, + const float square_dist_thr, + core::Tensor &information) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp index 80f4b2a4f73..c7503191fdb 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -79,22 +79,19 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, cgh.parallel_for( sycl::nd_range<1>{num_groups * wgs, wgs}, [=](sycl::nd_item<1> item) { - const int gid = - item.get_global_id(0); + const int gid = item.get_global_id(0); scalar_t local_sum[kReduceDim] = {}; if (gid < n) { scalar_t J_ij[6] = {0}; scalar_t r = 0; - const bool valid = - GetJacobianPointToPlane< - scalar_t>( - gid, - source_points_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - J_ij, r); + const bool valid = GetJacobianPointToPlane< + scalar_t>( + gid, source_points_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); if (valid) { const scalar_t w = @@ -112,8 +109,7 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, J_ij[j] * w * r; } local_sum[27] += r; - local_sum[28] += - scalar_t(1); + local_sum[28] += scalar_t(1); } } @@ -182,8 +178,7 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, cgh.parallel_for( sycl::nd_range<1>{num_groups * wgs, wgs}, [=](sycl::nd_item<1> item) { - const int gid = - item.get_global_id(0); + const int gid = item.get_global_id(0); scalar_t local_sum[kReduceDim] = {}; if (gid < n) { @@ -191,21 +186,18 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, J_I[6] = {0}; scalar_t r_G = 0, r_I = 0; - const bool valid = - GetJacobianColoredICP< - scalar_t>( - gid, - source_points_ptr, - source_colors_ptr, - 
target_points_ptr, - target_normals_ptr, - target_colors_ptr, - target_color_gradients_ptr, - correspondence_indices_ptr, - sqrt_lambda_geometric, - sqrt_lambda_photometric, - J_G, J_I, r_G, - r_I); + const bool valid = GetJacobianColoredICP< + scalar_t>( + gid, source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, r_I); if (valid) { const scalar_t w_G = @@ -281,125 +273,205 @@ void ComputePoseDopplerICPSYCL( const size_t wgs = device_props.max_work_group_size; const size_t num_groups = ((size_t)n + wgs - 1) / wgs; - DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { - const scalar_t sqrt_lambda_geometric = - sqrt(1.0 - static_cast(lambda_doppler)); - const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler); - const scalar_t sqrt_lambda_doppler_by_dt = - sqrt_lambda_doppler / static_cast(period); - - // Pre-compute v_s_in_S using a single-task kernel. 
- { - const scalar_t *R_S_to_V_ptr = R_S_to_V.GetDataPtr(); - const scalar_t *r_v_to_s_in_V_ptr = - r_v_to_s_in_V.GetDataPtr(); - const scalar_t *w_v_in_V_ptr = w_v_in_V.GetDataPtr(); - const scalar_t *v_v_in_V_ptr = v_v_in_V.GetDataPtr(); - scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); - queue.single_task([=]() { - PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, - w_v_in_V_ptr, v_v_in_V_ptr, - v_s_in_S_ptr); - }).wait_and_throw(); - } - - scalar_t *global_sum_ptr = global_sum.GetDataPtr(); - - DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION( - scalar_t, kernel_geometric.type_, - kernel_geometric.scaling_parameter_, kernel_doppler.type_, - kernel_doppler.scaling_parameter_, [&]() { - const scalar_t *source_points_ptr = - source_points.GetDataPtr(); - const scalar_t *source_dopplers_ptr = - source_dopplers.GetDataPtr(); - const scalar_t *source_directions_ptr = - source_directions.GetDataPtr(); - const scalar_t *target_points_ptr = - target_points.GetDataPtr(); - const scalar_t *target_normals_ptr = - target_normals.GetDataPtr(); - const int64_t *correspondence_indices_ptr = - correspondence_indices.GetDataPtr(); - const scalar_t *R_S_to_V_ptr = - R_S_to_V.GetDataPtr(); - const scalar_t *r_v_to_s_in_V_ptr = - r_v_to_s_in_V.GetDataPtr(); - const scalar_t *v_s_in_S_ptr = - v_s_in_S.GetDataPtr(); - - queue.submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<1>{num_groups * wgs, wgs}, - [=](sycl::nd_item<1> item) { - const int gid = - item.get_global_id(0); - scalar_t local_sum[kReduceDim] = {}; - - if (gid < n) { - scalar_t J_G[6] = {0}, - J_D[6] = {0}; - scalar_t r_G = 0, r_D = 0; - - const bool valid = - GetJacobianDopplerICP< - scalar_t>( - gid, - source_points_ptr, - source_dopplers_ptr, - source_directions_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - R_S_to_V_ptr, - r_v_to_s_in_V_ptr, - v_s_in_S_ptr, - reject_dynamic_outliers, - static_cast( - doppler_outlier_threshold), - sqrt_lambda_geometric, - 
sqrt_lambda_doppler, - sqrt_lambda_doppler_by_dt, - J_G, J_D, r_G, - r_D); - - if (valid) { - const scalar_t w_G = - GetWeightFromRobustKernelFirst( - r_G); - const scalar_t w_D = - GetWeightFromRobustKernelSecond( - r_D); - - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; - ++k) { - local_sum[i++] += - J_G[j] * w_G * - J_G[k] + - J_D[j] * w_D * - J_D[k]; - } - local_sum[21 + j] += - J_G[j] * w_G * - r_G + - J_D[j] * w_D * r_D; - } - local_sum[27] += - r_G * r_G + r_D * r_D; - local_sum[28] += scalar_t(1); - } + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, + [&]() { + const scalar_t sqrt_lambda_geometric = + sqrt(1.0 - + static_cast( + lambda_doppler)); + const scalar_t sqrt_lambda_doppler = + sqrt(lambda_doppler); + const scalar_t + sqrt_lambda_doppler_by_dt = + sqrt_lambda_doppler / + static_cast( + period); + + // Pre-compute v_s_in_S using a + // single-task kernel. + { + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *w_v_in_V_ptr = + w_v_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *v_v_in_V_ptr = + v_v_in_V.GetDataPtr< + scalar_t>(); + scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + queue.single_task([=]() { + PreComputeForDopplerICP( + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + w_v_in_V_ptr, + v_v_in_V_ptr, + v_s_in_S_ptr); + }).wait_and_throw(); } - GroupReduceAndAdd( - item, local_sum, - global_sum_ptr); + scalar_t *global_sum_ptr = + global_sum.GetDataPtr< + scalar_t>(); + + DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(scalar_t, + kernel_geometric + .type_, + kernel_geometric + .scaling_parameter_, + kernel_doppler + .type_, + kernel_doppler + .scaling_parameter_, + [&]() { + const scalar_t *source_points_ptr = + source_points + .GetDataPtr< + scalar_t>(); + const scalar_t *source_dopplers_ptr = + source_dopplers + .GetDataPtr< + scalar_t>(); + const scalar_t *source_directions_ptr = + source_directions + 
.GetDataPtr< + scalar_t>(); + const scalar_t *target_points_ptr = + target_points + .GetDataPtr< + scalar_t>(); + const scalar_t *target_normals_ptr = + target_normals + .GetDataPtr< + scalar_t>(); + const int64_t *correspondence_indices_ptr = + correspondence_indices + .GetDataPtr< + int64_t>(); + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V + .GetDataPtr< + scalar_t>(); + const scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + + queue + .submit([&](sycl::handler + &cgh) { + cgh.parallel_for( + sycl::nd_range< + 1>{ + num_groups * + wgs, + wgs}, + [=](sycl::nd_item< + 1> item) { + const int gid = + item.get_global_id( + 0); + scalar_t local_sum + [kReduceDim] = + {}; + + if (gid < + n) { + scalar_t J_G + [6] = {0}, + J_D[6] = { + 0}; + scalar_t + r_G = 0, + r_D = 0; + + const bool valid = GetJacobianDopplerICP< + scalar_t>( + gid, + source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast< + scalar_t>( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, + J_D, + r_G, + r_D); + + if (valid) { + const scalar_t + w_G = GetWeightFromRobustKernelFirst( + r_G); + const scalar_t + w_D = GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; + j < + 6; + ++j) { + for (int k = 0; + k <= + j; + ++k) { + local_sum + [i++] += + J_G[j] * + w_G * + J_G[k] + + J_D[j] * + w_D * + J_D[k]; + } + local_sum + [21 + + j] += + J_G[j] * + w_G * + r_G + + J_D[j] * + w_D * + r_D; + } + local_sum + [27] += + r_G * r_G + + r_D * r_D; + local_sum[28] += scalar_t( + 1); + } + } + + GroupReduceAndAdd< + kReduceDim, + scalar_t>( + item, + local_sum, + global_sum_ptr); + }); + }) + .wait_and_throw(); + }); }); - }).wait_and_throw(); - }); - }); 
DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); } @@ -446,10 +518,9 @@ void ComputeInformationMatrixSYCL(const core::Tensor &target_points, int i = 0; for (int j = 0; j < 6; ++j) { for (int k = 0; k <= j; ++k) { - local_sum[i++] += - J_x[j] * J_x[k] + - J_y[j] * J_y[k] + - J_z[j] * J_z[k]; + local_sum[i++] += J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; } } } diff --git a/cpp/tests/t/pipelines/TransformationConverter.cpp b/cpp/tests/t/pipelines/TransformationConverter.cpp index 8036b55a08f..a4c092c62d6 100644 --- a/cpp/tests/t/pipelines/TransformationConverter.cpp +++ b/cpp/tests/t/pipelines/TransformationConverter.cpp @@ -15,9 +15,10 @@ namespace open3d { namespace tests { class TransformationConverterPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(TransformationConverter, - TransformationConverterPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + TransformationConverter, + TransformationConverterPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TransformationConverterPermuteDevices, RtToTransformation) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp index 7d1b8b6466b..64d2a97d4a7 100644 --- a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp +++ b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp @@ -22,9 +22,10 @@ namespace open3d { namespace tests { class OdometryPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(Odometry, - OdometryPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + Odometry, + OdometryPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); core::Tensor CreateIntrisicTensor() { camera::PinholeCameraIntrinsic intrinsic = camera::PinholeCameraIntrinsic( diff --git a/cpp/tests/t/pipelines/registration/Feature.cpp 
b/cpp/tests/t/pipelines/registration/Feature.cpp index 1266ab7f705..7cb5f27ea8c 100644 --- a/cpp/tests/t/pipelines/registration/Feature.cpp +++ b/cpp/tests/t/pipelines/registration/Feature.cpp @@ -21,9 +21,10 @@ namespace open3d { namespace tests { class FeaturePermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(Feature, - FeaturePermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + Feature, + FeaturePermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(FeaturePermuteDevices, SelectByIndex) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/registration/Registration.cpp b/cpp/tests/t/pipelines/registration/Registration.cpp index 76b31da3dd9..d3d0544e27c 100644 --- a/cpp/tests/t/pipelines/registration/Registration.cpp +++ b/cpp/tests/t/pipelines/registration/Registration.cpp @@ -27,9 +27,10 @@ namespace open3d { namespace tests { class RegistrationPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(Registration, - RegistrationPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + Registration, + RegistrationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(RegistrationPermuteDevices, ICPConvergenceCriteriaConstructor) { // Constructor. 
diff --git a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp index c98ab268529..afc7f40e7a8 100644 --- a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp +++ b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp @@ -16,9 +16,10 @@ namespace open3d { namespace tests { class TransformationEstimationPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(TransformationEstimation, - TransformationEstimationPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + TransformationEstimation, + TransformationEstimationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); static std:: tuple diff --git a/cpp/tests/t/pipelines/slac/ControlGrid.cpp b/cpp/tests/t/pipelines/slac/ControlGrid.cpp index 5e8e5d59949..7f716f553ee 100644 --- a/cpp/tests/t/pipelines/slac/ControlGrid.cpp +++ b/cpp/tests/t/pipelines/slac/ControlGrid.cpp @@ -25,9 +25,10 @@ static t::geometry::PointCloud CreateTPCDFromFile( } class ControlGridPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(ControlGrid, - ControlGridPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + ControlGrid, + ControlGridPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // TODO(wei): more well-designed test cases TEST_P(ControlGridPermuteDevices, Touch) { diff --git a/cpp/tests/t/pipelines/slac/SLAC.cpp b/cpp/tests/t/pipelines/slac/SLAC.cpp index 73aef1ad212..00926a38741 100644 --- a/cpp/tests/t/pipelines/slac/SLAC.cpp +++ b/cpp/tests/t/pipelines/slac/SLAC.cpp @@ -28,9 +28,10 @@ namespace open3d { namespace tests { class SLACPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(SLAC, - SLACPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + SLAC, + 
SLACPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // PointCloud is similar if fitness is higher and rmse is lower than tolerance // threshold. From 83753680dc361499f8810968156931bef19ab8f9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 07:18:18 +0000 Subject: [PATCH 09/14] Initial plan From 4018268dcb51e73d0d9a4f4228b2aa666f9f89fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 07:30:51 +0000 Subject: [PATCH 10/14] Add SYCL nearest-neighbor search support Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/core/CMakeLists.txt | 1 + cpp/open3d/core/nns/FixedRadiusIndex.cpp | 28 + cpp/open3d/core/nns/FixedRadiusIndex.h | 34 ++ cpp/open3d/core/nns/KnnIndex.cpp | 26 +- cpp/open3d/core/nns/KnnIndex.h | 18 +- cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp | 511 ++++++++++++++++++ cpp/open3d/core/nns/NearestNeighborSearch.cpp | 47 +- cpp/open3d/core/nns/NearestNeighborSearch.h | 20 +- .../core/nns/nearest_neighbor_search.cpp | 14 +- cpp/tests/core/NearestNeighborSearch.cpp | 2 +- python/test/core/test_nn.py | 6 +- 11 files changed, 670 insertions(+), 37 deletions(-) create mode 100644 cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index 210cc33058a..7821b4a8f97 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -102,6 +102,7 @@ open3d_sycl_target_sources(core_impl PRIVATE linalg/LeastSquaresSYCL.cpp linalg/LUSYCL.cpp linalg/MatmulSYCL.cpp + nns/KnnSearchOpsSYCL.cpp linalg/SolveSYCL.cpp linalg/SVDSYCL.cpp linalg/TriSYCL.cpp diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.cpp b/cpp/open3d/core/nns/FixedRadiusIndex.cpp index 167a6701a32..b1254c396e1 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.cpp +++ b/cpp/open3d/core/nns/FixedRadiusIndex.cpp @@ -106,6 +106,14 
@@ bool FixedRadiusIndex::SetTensorData(const Tensor &dataset_points, utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return true; +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { CALL_BUILD(float, BuildSpatialHashTableCPU) @@ -173,6 +181,16 @@ std::tuple FixedRadiusIndex::SearchRadius( utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + FixedRadiusSearchSYCL(RADIUS_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { @@ -242,6 +260,16 @@ std::tuple FixedRadiusIndex::SearchHybrid( utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + HybridSearchSYCL(HYBRID_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. 
Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.h b/cpp/open3d/core/nns/FixedRadiusIndex.h index 838c1ccc484..d3247bb7d29 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.h +++ b/cpp/open3d/core/nns/FixedRadiusIndex.h @@ -376,6 +376,40 @@ void HybridSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool sort, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); + +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance); +#endif + /// \class FixedRadiusIndex /// /// \brief FixedRadiusIndex for nearest neighbor range search. diff --git a/cpp/open3d/core/nns/KnnIndex.cpp b/cpp/open3d/core/nns/KnnIndex.cpp index 2d1f5de6b51..596ad92d69d 100644 --- a/cpp/open3d/core/nns/KnnIndex.cpp +++ b/cpp/open3d/core/nns/KnnIndex.cpp @@ -67,6 +67,17 @@ bool KnnIndex::SetTensorData(const Tensor& dataset_points, utility::LogError( "GPU Tensor is not supported when -DBUILD_CUDA_MODULE=OFF. 
" "Please recompile Open3d With -DBUILD_CUDA_MODULE=ON."); +#endif + } else if (dataset_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + dataset_points_ = dataset_points.Contiguous(); + points_row_splits_ = points_row_splits.Contiguous(); + index_dtype_ = index_dtype; + return true; +#else + utility::LogError( + "SYCL Tensor is not supported when -DBUILD_SYCL_MODULE=OFF. " + "Please recompile Open3D with -DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( @@ -126,11 +137,22 @@ std::pair KnnIndex::SearchKnn(const Tensor& query_points, utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + const Dtype index_dtype = GetIndexDtype(); + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + KnnSearchSYCL(KNN_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " - "-DBUILD_CUDA_MODULE=ON."); + "KnnIndex only supports CUDA and SYCL tensors. Please use " + "NanoFlannIndex instead for CPU tensors."); } return std::make_pair(neighbors_index, neighbors_distance); } diff --git a/cpp/open3d/core/nns/KnnIndex.h b/cpp/open3d/core/nns/KnnIndex.h index 458a3dd0b35..6628c92850c 100644 --- a/cpp/open3d/core/nns/KnnIndex.h +++ b/cpp/open3d/core/nns/KnnIndex.h @@ -29,14 +29,28 @@ void KnnSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); +#endif + class KnnIndex : public NNSIndex { public: KnnIndex(); /// \brief Parameterized Constructor. 
/// - /// \param dataset_points Provides a set of data points as Tensor for KDTree - /// construction. + /// \param dataset_points Provides a set of data points as Tensor for nearest + /// neighbor search. CPU tensors use NanoFlann through + /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors are + /// handled by this class. KnnIndex(const Tensor& dataset_points); KnnIndex(const Tensor& dataset_points, const Dtype& index_dtype); ~KnnIndex(); diff --git a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp new file mode 100644 index 00000000000..5bb4f2d02a7 --- /dev/null +++ b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp @@ -0,0 +1,511 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include "open3d/core/MemoryManager.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/nns/FixedRadiusIndex.h" +#include "open3d/core/nns/KnnIndex.h" +#include "open3d/core/nns/NeighborSearchAllocator.h" +#include "open3d/utility/Helper.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace nns { + +namespace { + +template +void ComputeSquaredDistancesSYCL(const Device& device, + const T* points_ptr, + int64_t num_points, + int64_t dimension, + const T* query_ptr, + T* distances_ptr) { + if (num_points == 0) { + return; + } + + constexpr int64_t kWorkGroupSize = 128; + const int64_t global_size = + utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.submit([&](sycl::handler& cgh) { + sycl::local_accessor 
local_query( + sycl::range<1>(dimension), cgh); + cgh.parallel_for( + sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t lid = item.get_local_linear_id(); + for (int64_t d = lid; d < dimension; + d += item.get_local_range(0)) { + local_query[d] = query_ptr[d]; + } + item.barrier(sycl::access::fence_space::local_space); + + const int64_t point_idx = item.get_global_linear_id(); + if (point_idx >= num_points) { + return; + } + + const T* point = points_ptr + point_idx * dimension; + T dist = 0; + for (int64_t d = 0; d < dimension; ++d) { + const T diff = point[d] - local_query[d]; + dist += diff * diff; + } + distances_ptr[point_idx] = dist; + }); + }).wait_and_throw(); +} + +template +void SortIndicesByDistanceSYCL(const Device& device, + const T* distances_ptr, + TIndex* indices_ptr, + int64_t num_points) { + if (num_points <= 1) { + return; + } + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + std::sort(policy, indices_ptr, indices_ptr + num_points, + [distances_ptr](TIndex lhs, TIndex rhs) { + const T lhs_dist = distances_ptr[lhs]; + const T rhs_dist = distances_ptr[rhs]; + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs < rhs; + }); + queue.wait_and_throw(); +} + +template +int64_t CountWithinRadiusSYCL(const Device& device, + const T* distances_ptr, + const TIndex* sorted_indices_ptr, + int64_t num_points, + T threshold) { + if (num_points == 0) { + return 0; + } + + constexpr int64_t kWorkGroupSize = 128; + const int64_t global_size = + utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; + Tensor count = Tensor::Zeros({1}, Int64, device); + int64_t* count_ptr = count.GetDataPtr(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.submit([&](sycl::handler& cgh) { + 
cgh.parallel_for( + sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t idx = item.get_global_linear_id(); + int local_count = 0; + if (idx < num_points) { + local_count = + distances_ptr[sorted_indices_ptr[idx]] <= + threshold + ? 1 + : 0; + } + auto group = item.get_group(); + const int group_count = sycl::reduce_over_group( + group, local_count, sycl::plus()); + if (item.get_local_linear_id() == 0 && + group_count > 0) { + sycl::atomic_ref + count_ref(*count_ptr); + count_ref += group_count; + } + }); + }).wait_and_throw(); + + return count.To(Device("CPU:0")).Item(); +} + +template +void GatherResultsSYCL(const Device& device, + const T* distances_ptr, + const TIndex* sorted_indices_ptr, + int64_t num_results, + TIndex index_offset, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + if (num_results == 0) { + return; + } + + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_results), [=](sycl::id<1> id) { + const int64_t i = id[0]; + const TIndex point_idx = sorted_indices_ptr[i]; + out_indices_ptr[i] = point_idx + index_offset; + if (out_distances_ptr != nullptr) { + out_distances_ptr[i] = distances_ptr[point_idx]; + } + }).wait_and_throw(); +} + +template +void ProcessQuerySYCL(const Tensor& points, + const Tensor& query, + int64_t num_results, + TIndex index_offset, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + const Device device = points.GetDevice(); + const int64_t num_points = points.GetShape(0); + + Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); + Tensor sorted_indices = Tensor::Arange( + 0, num_points, 1, Dtype::FromType(), device); + + ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, + points.GetShape(1), query.GetDataPtr(), + distances.GetDataPtr()); + SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), + sorted_indices.GetDataPtr(), + 
num_points); + GatherResultsSYCL(device, distances.GetDataPtr(), + sorted_indices.GetDataPtr(), + num_results, index_offset, out_indices_ptr, + out_distances_ptr); +} + +template +int64_t CountQueryResultsWithinRadiusSYCL(const Tensor& points, + const Tensor& query, + T threshold) { + const Device device = points.GetDevice(); + const int64_t num_points = points.GetShape(0); + + Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); + Tensor sorted_indices = Tensor::Arange( + 0, num_points, 1, Dtype::FromType(), device); + + ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, + points.GetShape(1), query.GetDataPtr(), + distances.GetDataPtr()); + SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), + sorted_indices.GetDataPtr(), + num_points); + return CountWithinRadiusSYCL( + device, distances.GetDataPtr(), sorted_indices.GetDataPtr(), + num_points, threshold); +} + +template +void GatherQueryResultsWithinRadiusSYCL(const Tensor& points, + const Tensor& query, + int64_t num_results, + TIndex index_offset, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + ProcessQuerySYCL(points, query, num_results, index_offset, + out_indices_ptr, out_distances_ptr); +} + +} // namespace + +template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + const Device device = points.GetDevice(); + const int batch_size = points_row_splits.GetShape(0) - 1; + std::vector> batch_output_allocators( + batch_size, NeighborSearchAllocator(device)); + + int64_t* neighbors_row_splits_ptr = + neighbors_row_splits.GetDataPtr(); + int64_t last_neighbors_count = 0; + int64_t batch_knn = 0; + + for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = 
points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + batch_knn = std::min(knn, num_points_i); + + neighbors_row_splits_ptr[query_begin] = last_neighbors_count; + for (int64_t q = 0; q < num_queries_i; ++q) { + neighbors_row_splits_ptr[query_begin + q + 1] = + last_neighbors_count + (q + 1) * batch_knn; + } + last_neighbors_count += num_queries_i * batch_knn; + + TIndex* indices_ptr; + T* distances_ptr; + batch_output_allocators[batch_idx].AllocIndices( + &indices_ptr, num_queries_i * batch_knn, TIndex(-1)); + batch_output_allocators[batch_idx].AllocDistances( + &distances_ptr, num_queries_i * batch_knn, T(0)); + + for (int64_t q = 0; q < num_queries_i; ++q) { + const Tensor query = queries_i.Slice(0, q, q + 1).Flatten(); + ProcessQuerySYCL( + points_i, query, batch_knn, TIndex(point_begin), + indices_ptr + q * batch_knn, distances_ptr + q * batch_knn); + } + } + + if (batch_size == 1) { + neighbors_index = + batch_output_allocators[0].NeighborsIndex().View({queries.GetShape(0), + batch_knn}); + neighbors_distance = batch_output_allocators[0].NeighborsDistance().View( + {queries.GetShape(0), batch_knn}); + return; + } + + NeighborSearchAllocator output_allocator(device); + int64_t neighbors_size = 0; + for (const auto& allocator : batch_output_allocators) { + neighbors_size += allocator.NeighborsIndex().GetShape(0); + } + + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, neighbors_size); + output_allocator.AllocDistances(&neighbors_distance_ptr, neighbors_size); + + int64_t offset = 0; + for (const auto& allocator : 
batch_output_allocators) { + const int64_t batch_size_i = allocator.NeighborsIndex().GetShape(0); + if (batch_size_i == 0) { + continue; + } + MemoryManager::Memcpy(neighbors_index_ptr + offset, device, + allocator.IndicesPtr(), device, + sizeof(TIndex) * batch_size_i); + MemoryManager::Memcpy(neighbors_distance_ptr + offset, device, + allocator.DistancesPtr(), device, + sizeof(T) * batch_size_i); + offset += batch_size_i; + } + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL fixed radius search only supports L2 metric."); + } + if (ignore_query_point) { + utility::LogError( + "SYCL fixed radius search does not support ignore_query_point."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const T threshold = static_cast(radius * radius); + std::vector counts(num_queries, 0); + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + + for (int64_t q = query_begin; q < query_end; ++q) { + const Tensor query = queries.Slice(0, q, q + 1).Flatten(); + counts[q] = CountQueryResultsWithinRadiusSYCL( + points_i, 
query, threshold); + } + } + + std::vector row_splits(num_queries + 1, 0); + for (int64_t q = 0; q < num_queries; ++q) { + row_splits[q + 1] = row_splits[q] + counts[q]; + } + neighbors_row_splits = + Tensor(row_splits, {num_queries + 1}, Int64).To(device); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, row_splits.back()); + if (return_distances) { + output_allocator.AllocDistances(&neighbors_distance_ptr, + row_splits.back()); + } else { + output_allocator.AllocDistances(&neighbors_distance_ptr, 0); + } + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + + for (int64_t q = query_begin; q < query_end; ++q) { + if (counts[q] == 0) { + continue; + } + const Tensor query = queries.Slice(0, q, q + 1).Flatten(); + GatherQueryResultsWithinRadiusSYCL( + points_i, query, counts[q], TIndex(point_begin), + neighbors_index_ptr + row_splits[q], + return_distances ? 
neighbors_distance_ptr + row_splits[q] + : nullptr); + } + } + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL hybrid search only supports L2 metric."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const T threshold = static_cast(radius * radius); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, + num_queries * max_knn, TIndex(-1)); + output_allocator.AllocDistances(&neighbors_distance_ptr, + num_queries * max_knn, T(0)); + + std::vector counts(num_queries, 0); + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + + for (int64_t q = query_begin; q < query_end; ++q) { + const Tensor query = queries.Slice(0, q, q + 1).Flatten(); + const int64_t num_neighbors = + CountQueryResultsWithinRadiusSYCL( + points_i, query, threshold); + counts[q] = static_cast(std::min(num_neighbors, + max_knn)); + if (counts[q] == 0) { + continue; + } + GatherQueryResultsWithinRadiusSYCL( + points_i, query, counts[q], TIndex(point_begin), + neighbors_index_ptr + q * max_knn, + 
neighbors_distance_ptr + q * max_knn); + } + } + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); + neighbors_count = + Tensor(counts, {num_queries}, Dtype::FromType()).To(device); +} + +#define INSTANTIATE(T, TIndex) \ + template void KnnSearchSYCL( \ + const Tensor& points, const Tensor& points_row_splits, \ + const Tensor& queries, const Tensor& queries_row_splits, int knn, \ + Tensor& neighbors_index, Tensor& neighbors_row_splits, \ + Tensor& neighbors_distance); \ + template void FixedRadiusSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + const Tensor& points_row_splits, const Tensor& queries_row_splits, \ + const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ + const bool ignore_query_point, const bool return_distances, \ + const bool sort, Tensor& neighbors_index, \ + Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ + template void HybridSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + int max_knn, const Tensor& points_row_splits, \ + const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ + const Tensor&, const Metric metric, Tensor& neighbors_index, \ + Tensor& neighbors_count, Tensor& neighbors_distance); + +INSTANTIATE(float, int32_t) +INSTANTIATE(float, int64_t) +INSTANTIATE(double, int32_t) +INSTANTIATE(double, int64_t) + +} // namespace nns +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.cpp b/cpp/open3d/core/nns/NearestNeighborSearch.cpp index 9cc8030a35b..920971beaed 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.cpp +++ b/cpp/open3d/core/nns/NearestNeighborSearch.cpp @@ -21,21 +21,18 @@ bool NearestNeighborSearch::SetIndex() { }; bool NearestNeighborSearch::KnnIndex() { - if (dataset_points_.IsCUDA()) { -#ifdef BUILD_CUDA_MODULE + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { knn_index_.reset(new nns::KnnIndex()); 
return knn_index_->SetTensorData(dataset_points_, index_dtype_); -#else - utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please recompile Open3D with " - "-DBUILD_CUDA_MODULE=ON."); -#endif } else { return SetIndex(); } }; -bool NearestNeighborSearch::MultiRadiusIndex() { return SetIndex(); }; +bool NearestNeighborSearch::MultiRadiusIndex() { + AssertCPU(dataset_points_); + return SetIndex(); +}; bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { if (dataset_points_.IsCUDA()) { @@ -53,6 +50,15 @@ bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError("radius is required for SYCL FixedRadiusIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData(dataset_points_, + radius.value(), + index_dtype_); + } return SetIndex(); } } @@ -72,6 +78,15 @@ bool NearestNeighborSearch::HybridIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError("radius is required for SYCL HybridIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData(dataset_points_, + radius.value(), + index_dtype_); + } return SetIndex(); } }; @@ -80,7 +95,7 @@ std::pair NearestNeighborSearch::KnnSearch( const Tensor& query_points, int knn) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (knn_index_) { return knn_index_->SearchKnn(query_points, knn); } else { @@ -99,7 +114,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( const Tensor& query_points, double radius, bool sort) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return 
fixed_radius_index_->SearchRadius(query_points, radius, sort); @@ -117,7 +132,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( std::tuple NearestNeighborSearch::MultiRadiusSearch( const Tensor& query_points, const Tensor& radii) { - AssertNotCUDA(query_points); + AssertCPU(query_points); AssertTensorDtype(query_points, dataset_points_.GetDtype()); AssertTensorDtype(radii, dataset_points_.GetDtype()); @@ -133,7 +148,7 @@ std::tuple NearestNeighborSearch::HybridSearch( const int max_knn) const { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return fixed_radius_index_->SearchHybrid(query_points, radius, max_knn); @@ -150,11 +165,11 @@ std::tuple NearestNeighborSearch::HybridSearch( } } -void NearestNeighborSearch::AssertNotCUDA(const Tensor& t) const { - if (t.IsCUDA()) { +void NearestNeighborSearch::AssertCPU(const Tensor& t) const { + if (!t.IsCPU()) { utility::LogError( - "TODO: NearestNeighborSearch does not support CUDA tensor " - "yet."); + "NearestNeighborSearch only supports CPU tensors for this " + "operation."); } } diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.h b/cpp/open3d/core/nns/NearestNeighborSearch.h index b9fe14ad1ed..aa22ca352b5 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.h +++ b/cpp/open3d/core/nns/NearestNeighborSearch.h @@ -27,14 +27,12 @@ class NearestNeighborSearch { /// Constructor. /// /// \param dataset_points Dataset points for constructing search index. Must - /// be 2D, with shape {n, d}. SYCL tensors are not yet supported. + /// be 2D, with shape {n, d}. 
// NearestNeighborSearch(const Tensor &dataset_points) // : dataset_points_(dataset_points){}; NearestNeighborSearch(const Tensor &dataset_points, const Dtype &index_dtype = core::Int32) - : dataset_points_(dataset_points), index_dtype_(index_dtype) { - AssertNotSYCL(dataset_points_); - }; + : dataset_points_(dataset_points), index_dtype_(index_dtype) {} ~NearestNeighborSearch(); NearestNeighborSearch(const NearestNeighborSearch &) = delete; NearestNeighborSearch &operator=(const NearestNeighborSearch &) = delete; @@ -47,17 +45,22 @@ class NearestNeighborSearch { /// Set index for multi-radius search. /// + /// Multi-radius search is currently only supported for CPU tensors. + /// /// \return Returns true if building index success, otherwise false. bool MultiRadiusIndex(); /// Set index for fixed-radius search. /// - /// \param radius optional radius parameter. required for gpu fixed radius - /// index. \return Returns true if building index success, otherwise false. + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// fixed radius indices. + /// \return Returns true if building index success, otherwise false. bool FixedRadiusIndex(std::optional radius = {}); /// Set index for hybrid search. /// + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// hybrid indices. /// \return Returns true if building index success, otherwise false. bool HybridIndex(std::optional radius = {}); @@ -88,6 +91,7 @@ class NearestNeighborSearch { const Tensor &query_points, double radius, bool sort = true); /// Perform multi-radius search. Each query point has an independent radius. + /// Only CPU tensors are currently supported for this search mode. /// /// \param query_points Query points. Must be 2D, with shape {n, d}. /// \param radii Radii of query points. Each query point has one radius. @@ -121,8 +125,8 @@ class NearestNeighborSearch { private: bool SetIndex(); - /// Assert a Tensor is not CUDA tensoer. 
This will be removed in the future. - void AssertNotCUDA(const Tensor &t) const; + /// Assert a Tensor is on CPU. + void AssertCPU(const Tensor &t) const; protected: std::unique_ptr nanoflann_index_; diff --git a/cpp/pybind/core/nns/nearest_neighbor_search.cpp b/cpp/pybind/core/nns/nearest_neighbor_search.cpp index 3bb647592a0..99394d8681a 100644 --- a/cpp/pybind/core/nns/nearest_neighbor_search.cpp +++ b/cpp/pybind/core/nns/nearest_neighbor_search.cpp @@ -84,7 +84,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for fixed-radius search. Required - for GPU fixed radius index. + for CUDA and SYCL fixed radius indices. Returns: True on success. @@ -115,7 +115,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for hybrid search. Required - for GPU hybrid index. + for CUDA and SYCL hybrid indices. Returns: True on success. @@ -130,7 +130,8 @@ This function needs to be called once before performing search operations. To use knn_search initialize the index using knn_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. knn (int): Number of neighbors to search per query point. Example: @@ -179,6 +180,7 @@ This function needs to be called once before performing search operations. Args: query_points (open3d.core.Tensor): Query points with shape {n, d}. + CPU, CUDA, and SYCL devices are supported. radius (float): Radius value for fixed-radius search. Note that this parameter can differ from the radius used to initialize the index for convenience, which may cause the index to be rebuilt for GPU @@ -234,7 +236,8 @@ This function needs to be called once before performing search operations. 
To use multi_radius_search initialize the index using multi_radius_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. Only + CPU tensors are supported. radii (open3d.core.Tensor): Radii of query points. Each query point has one radius. Returns: @@ -292,7 +295,8 @@ Hybrid search behaves similarly to fixed-radius search, but with a maximum numbe To use hybrid_search initialize the index using hybrid_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. radius (float): Radius value for hybrid search. max_knn (int): Maximum number of neighbor to search per query. diff --git a/cpp/tests/core/NearestNeighborSearch.cpp b/cpp/tests/core/NearestNeighborSearch.cpp index d282e8ecd9a..547d3829a06 100644 --- a/cpp/tests/core/NearestNeighborSearch.cpp +++ b/cpp/tests/core/NearestNeighborSearch.cpp @@ -22,7 +22,7 @@ namespace open3d { namespace tests { -class NNSPermuteDevices : public PermuteDevices {}; +class NNSPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(NearestNeighborSearch, NNSPermuteDevices, testing::ValuesIn(PermuteDevices::TestCases())); diff --git a/python/test/core/test_nn.py b/python/test/core/test_nn.py index 48b7d2682cd..b5376fec105 100644 --- a/python/test/core/test_nn.py +++ b/python/test/core/test_nn.py @@ -19,7 +19,7 @@ np.random.seed(0) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_index(device): dtype = o3c.float32 @@ -34,7 +34,7 @@ def test_knn_index(device): assert nns.multi_radius_index() -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_search(device): dtype = 
o3c.float32 @@ -107,7 +107,7 @@ def test_knn_search(device): rtol=1e-5) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_fixed_radius_search(device, dtype): dataset_points = o3c.Tensor( From 26e069761be69d3255ca51bb262c5b64e964fc06 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 07:32:25 +0000 Subject: [PATCH 11/14] Polish SYCL nearest-neighbor messages Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/core/nns/FixedRadiusIndex.cpp | 6 +++--- cpp/open3d/core/nns/KnnIndex.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.cpp b/cpp/open3d/core/nns/FixedRadiusIndex.cpp index b1254c396e1..a7316b1c8ff 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.cpp +++ b/cpp/open3d/core/nns/FixedRadiusIndex.cpp @@ -104,7 +104,7 @@ bool FixedRadiusIndex::SetTensorData(const Tensor &dataset_points, CALL_BUILD(double, BuildSpatialHashTableCUDA) #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { @@ -179,7 +179,7 @@ std::tuple FixedRadiusIndex::SearchRadius( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { @@ -258,7 +258,7 @@ std::tuple FixedRadiusIndex::SearchHybrid( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. 
Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { diff --git a/cpp/open3d/core/nns/KnnIndex.cpp b/cpp/open3d/core/nns/KnnIndex.cpp index 596ad92d69d..a2c24233a2e 100644 --- a/cpp/open3d/core/nns/KnnIndex.cpp +++ b/cpp/open3d/core/nns/KnnIndex.cpp @@ -135,7 +135,7 @@ std::pair KnnIndex::SearchKnn(const Tensor& query_points, }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { From e10955f817908d7606f8a4a360f324fb1435c854 Mon Sep 17 00:00:00 2001 From: Sameer Sheorey Date: Sun, 15 Mar 2026 23:16:09 -0700 Subject: [PATCH 12/14] Optimized implementation with tiles. --- cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp | 901 +++++++++++++++++------ cpp/tests/core/CMakeLists.txt | 19 +- cpp/tests/core/KnnIndex.cpp | 34 +- 3 files changed, 698 insertions(+), 256 deletions(-) diff --git a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp index 5bb4f2d02a7..2d20e4b86fa 100644 --- a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp +++ b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp @@ -6,14 +6,14 @@ // ---------------------------------------------------------------------------- #include +#include #include #include #include -#include -#include "open3d/core/MemoryManager.h" #include "open3d/core/SYCLContext.h" #include "open3d/core/Tensor.h" +#include "open3d/core/linalg/AddMM.h" #include "open3d/core/nns/FixedRadiusIndex.h" #include "open3d/core/nns/KnnIndex.h" #include "open3d/core/nns/NeighborSearchAllocator.h" @@ -24,8 +24,27 @@ namespace open3d { namespace core { namespace nns { +// SYCL nearest-neighbor search uses tiled distance evaluation with AddMM, +// batched threshold/count helpers, and query-local top-k selection/merge. namespace { +// Chooses a query/point tile shape that limits temporary distance storage. 
+inline void ChooseTileSizeSYCL(int64_t num_queries, + int64_t num_points, + int64_t element_size, + int64_t& tile_queries, + int64_t& tile_points) { + constexpr int64_t kTargetTileBytes = 8 * 1024 * 1024; + tile_queries = std::min(num_queries, 128); + tile_queries = std::max(tile_queries, 1); + tile_points = std::max( + kTargetTileBytes / (tile_queries * element_size), int64_t(256)); + tile_points = std::min(tile_points, num_points); + tile_points = std::max(tile_points, 1); +} + +// Computes squared L2 distances from one query to all points with a SYCL +// kernel. This path is still used by the small single-row helpers below. template void ComputeSquaredDistancesSYCL(const Device& device, const T* points_ptr, @@ -40,199 +59,323 @@ void ComputeSquaredDistancesSYCL(const Device& device, constexpr int64_t kWorkGroupSize = 128; const int64_t global_size = utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); queue.submit([&](sycl::handler& cgh) { - sycl::local_accessor local_query( - sycl::range<1>(dimension), cgh); - cgh.parallel_for( - sycl::nd_range<1>(sycl::range<1>(global_size), - sycl::range<1>(kWorkGroupSize)), - [=](sycl::nd_item<1> item) { - const int64_t lid = item.get_local_linear_id(); - for (int64_t d = lid; d < dimension; - d += item.get_local_range(0)) { - local_query[d] = query_ptr[d]; - } - item.barrier(sycl::access::fence_space::local_space); - - const int64_t point_idx = item.get_global_linear_id(); - if (point_idx >= num_points) { - return; - } - - const T* point = points_ptr + point_idx * dimension; - T dist = 0; - for (int64_t d = 0; d < dimension; ++d) { - const T diff = point[d] - local_query[d]; - dist += diff * diff; - } - distances_ptr[point_idx] = dist; - }); - }).wait_and_throw(); + sycl::local_accessor local_query(sycl::range<1>(dimension), cgh); + cgh.parallel_for( + 
sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t lid = item.get_local_linear_id(); + for (int64_t d = lid; d < dimension; + d += item.get_local_range(0)) { + local_query[d] = query_ptr[d]; + } + item.barrier(sycl::access::fence_space::local_space); + + const int64_t point_idx = item.get_global_linear_id(); + if (point_idx >= num_points) { + return; + } + + const T* point = points_ptr + point_idx * dimension; + T dist = 0; + for (int64_t d = 0; d < dimension; ++d) { + const T diff = point[d] - local_query[d]; + dist += diff * diff; + } + distances_ptr[point_idx] = dist; + }); + }); } -template -void SortIndicesByDistanceSYCL(const Device& device, - const T* distances_ptr, - TIndex* indices_ptr, - int64_t num_points) { - if (num_points <= 1) { +// Counts how many entries per query are within a threshold using one launch per +// tile, avoiding per-query device algorithm dispatch. +template +void CountWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + int64_t* counts_ptr) { + if (num_queries == 0 || num_points == 0) { return; } - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); - auto policy = oneapi::dpl::execution::make_device_policy(queue); - std::sort(policy, indices_ptr, indices_ptr + num_points, - [distances_ptr](TIndex lhs, TIndex rhs) { - const T lhs_dist = distances_ptr[lhs]; - const T rhs_dist = distances_ptr[rhs]; - if (lhs_dist < rhs_dist) { - return true; - } - if (rhs_dist < lhs_dist) { - return false; - } - return lhs < rhs; - }); - queue.wait_and_throw(); + + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + int64_t local_count = 0; + const T* query_distances = + distances_ptr + query_idx * 
distance_query_stride; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + if (query_distances[point_idx] <= threshold) { + ++local_count; + } + } + counts_ptr[query_idx] += local_count; + }); } +// Appends per-query threshold matches into the final neighbor buffers using +// caller-provided query offsets. template -int64_t CountWithinRadiusSYCL(const Device& device, - const T* distances_ptr, - const TIndex* sorted_indices_ptr, - int64_t num_points, - T threshold) { - if (num_points == 0) { - return 0; +void GatherWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + TIndex index_offset, + int64_t* offsets_ptr, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + if (num_queries == 0 || num_points == 0) { + return; } - constexpr int64_t kWorkGroupSize = 128; - const int64_t global_size = - utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; - Tensor count = Tensor::Zeros({1}, Int64, device); - int64_t* count_ptr = count.GetDataPtr(); - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); - - queue.submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<1>(sycl::range<1>(global_size), - sycl::range<1>(kWorkGroupSize)), - [=](sycl::nd_item<1> item) { - const int64_t idx = item.get_global_linear_id(); - int local_count = 0; - if (idx < num_points) { - local_count = - distances_ptr[sorted_indices_ptr[idx]] <= - threshold - ? 
1 - : 0; - } - auto group = item.get_group(); - const int group_count = sycl::reduce_over_group( - group, local_count, sycl::plus()); - if (item.get_local_linear_id() == 0 && - group_count > 0) { - sycl::atomic_ref - count_ref(*count_ptr); - count_ref += group_count; - } - }); - }).wait_and_throw(); - - return count.To(Device("CPU:0")).Item(); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + int64_t offset = offsets_ptr[query_idx]; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + const T dist = query_distances[point_idx]; + if (dist <= threshold) { + out_indices_ptr[offset] = index_offset + point_idx; + if (out_distances_ptr != nullptr) { + out_distances_ptr[offset] = dist; + } + ++offset; + } + } + offsets_ptr[query_idx] = offset; + }); } +// Selects the smallest k distances per query with OneDPL partial_sort while +// reusing scratch index buffers allocated by the caller. 
template -void GatherResultsSYCL(const Device& device, - const T* distances_ptr, - const TIndex* sorted_indices_ptr, - int64_t num_results, - TIndex index_offset, - TIndex* out_indices_ptr, - T* out_distances_ptr) { - if (num_results == 0) { +void SelectTopKQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + int64_t knn, + TIndex index_offset, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride, + bool use_threshold = false, + T threshold = T(0)) { + if (num_queries == 0 || num_points == 0 || knn <= 0) { return; } - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>(num_results), [=](sycl::id<1> id) { - const int64_t i = id[0]; - const TIndex point_idx = sorted_indices_ptr[i]; - out_indices_ptr[i] = point_idx + index_offset; - if (out_distances_ptr != nullptr) { - out_distances_ptr[i] = distances_ptr[point_idx]; - } - }).wait_and_throw(); -} + const T inf = std::numeric_limits::max(); + const int64_t actual_knn = std::min(knn, num_points); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); -template -void ProcessQuerySYCL(const Tensor& points, - const Tensor& query, - int64_t num_results, - TIndex index_offset, - TIndex* out_indices_ptr, - T* out_distances_ptr) { - const Device device = points.GetDevice(); - const int64_t num_points = points.GetShape(0); - - Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); - Tensor sorted_indices = Tensor::Arange( - 0, num_points, 1, Dtype::FromType(), device); - - ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, - points.GetShape(1), query.GetDataPtr(), - distances.GetDataPtr()); - SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), - 
sorted_indices.GetDataPtr(), - num_points); - GatherResultsSYCL(device, distances.GetDataPtr(), - sorted_indices.GetDataPtr(), - num_results, index_offset, out_indices_ptr, - out_distances_ptr); + queue.parallel_for( + sycl::range<2>(num_queries, num_points), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + std::partial_sort(policy, query_indices, query_indices + actual_knn, + query_indices + num_points, + [query_distances](TIndex lhs, TIndex rhs) { + const T lhs_dist = query_distances[lhs]; + const T rhs_dist = query_distances[rhs]; + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs < rhs; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + + if (k >= actual_knn) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + + const TIndex local_idx = + scratch_indices_ptr[query_idx * scratch_query_stride + k]; + const T dist = + distances_ptr[query_idx * distance_query_stride + local_idx]; + if (use_threshold && dist > threshold) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + query_out_indices[k] = index_offset + local_idx; + query_out_distances[k] = dist; + }); } +// Merges two sorted top-k query buffers into a new top-k query buffer with +// OneDPL partial_sort over 2k candidates. 
template -int64_t CountQueryResultsWithinRadiusSYCL(const Tensor& points, - const Tensor& query, - T threshold) { - const Device device = points.GetDevice(); - const int64_t num_points = points.GetShape(0); - - Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); - Tensor sorted_indices = Tensor::Arange( - 0, num_points, 1, Dtype::FromType(), device); - - ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, - points.GetShape(1), query.GetDataPtr(), - distances.GetDataPtr()); - SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), - sorted_indices.GetDataPtr(), - num_points); - return CountWithinRadiusSYCL( - device, distances.GetDataPtr(), sorted_indices.GetDataPtr(), - num_points, threshold); +void MergeTopKQueriesSYCL(const Device& device, + const T* current_distances_ptr, + const TIndex* current_indices_ptr, + int64_t current_query_stride, + const T* candidate_distances_ptr, + const TIndex* candidate_indices_ptr, + int64_t candidate_query_stride, + int64_t num_queries, + int64_t knn, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride) { + if (num_queries == 0 || knn <= 0) { + return; + } + + const T inf = std::numeric_limits::max(); + const int64_t combined_cols = 2 * knn; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + queue.parallel_for( + sycl::range<2>(num_queries, combined_cols), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_current_dist = + current_distances_ptr + query_idx * current_query_stride; + const TIndex* query_current_idx = + current_indices_ptr + query_idx * current_query_stride; + 
const T* query_candidate_dist = + candidate_distances_ptr + query_idx * candidate_query_stride; + const TIndex* query_candidate_idx = + candidate_indices_ptr + query_idx * candidate_query_stride; + + std::partial_sort( + policy, query_indices, query_indices + knn, + query_indices + combined_cols, + [query_current_dist, query_current_idx, query_candidate_dist, + query_candidate_idx, knn](TIndex lhs, TIndex rhs) { + const bool lhs_is_current = lhs < knn; + const bool rhs_is_current = rhs < knn; + const TIndex lhs_idx = + lhs_is_current ? query_current_idx[lhs] + : query_candidate_idx[lhs - knn]; + const TIndex rhs_idx = + rhs_is_current ? query_current_idx[rhs] + : query_candidate_idx[rhs - knn]; + const T lhs_dist = + lhs_is_current ? query_current_dist[lhs] + : query_candidate_dist[lhs - knn]; + const T rhs_dist = + rhs_is_current ? query_current_dist[rhs] + : query_candidate_dist[rhs - knn]; + const bool lhs_valid = lhs_idx >= 0; + const bool rhs_valid = rhs_idx >= 0; + if (lhs_valid != rhs_valid) { + return lhs_valid; + } + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs_idx < rhs_idx; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + const TIndex source = + scratch_indices_ptr[query_idx * scratch_query_stride + k]; + const bool is_current = source < knn; + const int64_t offset = is_current ? source : source - knn; + const TIndex idx = + is_current + ? current_indices_ptr[query_idx * current_query_stride + + offset] + : candidate_indices_ptr[query_idx * + candidate_query_stride + + offset]; + const T dist = + is_current + ? 
current_distances_ptr[query_idx * + current_query_stride + + offset] + : candidate_distances_ptr + [query_idx * candidate_query_stride + offset]; + if (idx < 0) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + } else { + query_out_indices[k] = idx; + query_out_distances[k] = dist; + } + }); } +// Finalizes hybrid-search counts and clears any unused tail elements in the +// fixed-width output tensors. template -void GatherQueryResultsWithinRadiusSYCL(const Tensor& points, - const Tensor& query, - int64_t num_results, - TIndex index_offset, - TIndex* out_indices_ptr, - T* out_distances_ptr) { - ProcessQuerySYCL(points, query, num_results, index_offset, - out_indices_ptr, out_distances_ptr); +void FinalizeHybridResultsSYCL(const Device& device, + const int64_t* counts_ptr, + int64_t num_queries, + int64_t max_knn, + TIndex* neighbors_index_ptr, + T* neighbors_distance_ptr, + TIndex* neighbors_count_ptr) { + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const int64_t clipped_count = + std::min(counts_ptr[query_idx], max_knn); + neighbors_count_ptr[query_idx] = static_cast(clipped_count); + for (int64_t k = clipped_count; k < max_knn; ++k) { + neighbors_index_ptr[query_idx * max_knn + k] = TIndex(-1); + neighbors_distance_ptr[query_idx * max_knn + k] = T(0); + } + }); } } // namespace +// Batched KNN search with tiled AddMM distance evaluation and query-local top-k +// selection/merge on each tile. 
template void KnnSearchSYCL(const Tensor& points, const Tensor& points_row_splits, @@ -243,6 +386,9 @@ void KnnSearchSYCL(const Tensor& points, Tensor& neighbors_row_splits, Tensor& neighbors_distance) { const Device device = points.GetDevice(); + const Dtype dtype = points.GetDtype(); + const Dtype index_dtype = Dtype::FromType(); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); const int batch_size = points_row_splits.GetShape(0) - 1; std::vector> batch_output_allocators( batch_size, NeighborSearchAllocator(device)); @@ -253,8 +399,10 @@ void KnnSearchSYCL(const Tensor& points, int64_t batch_knn = 0; for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = @@ -280,20 +428,119 @@ void KnnSearchSYCL(const Tensor& points, batch_output_allocators[batch_idx].AllocDistances( &distances_ptr, num_queries_i * batch_knn, T(0)); - for (int64_t q = 0; q < num_queries_i; ++q) { - const Tensor query = queries_i.Slice(0, q, q + 1).Flatten(); - ProcessQuerySYCL( - points_i, query, batch_knn, TIndex(point_begin), - indices_ptr + q * batch_knn, distances_ptr + q * batch_knn); + Tensor out_indices = + batch_output_allocators[batch_idx].NeighborsIndex().View( + {num_queries_i, batch_knn}); + Tensor out_distances = + batch_output_allocators[batch_idx].NeighborsDistance().View( + {num_queries_i, batch_knn}); + + // Precompute norms for batched distance evaluation. 
+ Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries_i, num_points_i, sizeof(T), tile_queries, + tile_points); + + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = + Tensor::Empty({tile_queries, tile_points}, index_dtype, device); + Tensor tile_top_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor best_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor best_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merged_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor merged_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty({tile_queries, 2 * batch_knn}, + index_dtype, device); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + + Tensor best_indices_view = + best_indices.Slice(0, 0, num_queries_iter); + Tensor best_distances_view = + best_distances.Slice(0, 0, num_queries_iter); + best_indices_view.Fill(TIndex(-1)); + best_distances_view.Fill(std::numeric_limits::max()); + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); 
+ + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, batch_knn, TIndex(p), + tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), batch_knn); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), batch_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), batch_knn, + num_queries_iter, batch_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), batch_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; + } + + out_indices.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_indices_view; + out_distances.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_distances_view; } + + queue.wait_and_throw(); } if (batch_size == 1) { - neighbors_index = - batch_output_allocators[0].NeighborsIndex().View({queries.GetShape(0), - batch_knn}); - neighbors_distance = batch_output_allocators[0].NeighborsDistance().View( + neighbors_index = batch_output_allocators[0].NeighborsIndex().View( {queries.GetShape(0), batch_knn}); + neighbors_distance = + batch_output_allocators[0].NeighborsDistance().View( + 
{queries.GetShape(0), batch_knn}); return; } @@ -326,6 +573,8 @@ void KnnSearchSYCL(const Tensor& points, neighbors_distance = output_allocator.NeighborsDistance(); } +// Fixed-radius search using two tiled passes: one to count matches and one to +// gather indices and optional distances. template void FixedRadiusSearchSYCL(const Tensor& points, const Tensor& queries, @@ -347,38 +596,88 @@ void FixedRadiusSearchSYCL(const Tensor& points, } if (ignore_query_point) { utility::LogError( - "SYCL fixed radius search does not support ignore_query_point."); + "SYCL fixed radius search does not support " + "ignore_query_point."); } const Device device = points.GetDevice(); const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); const T threshold = static_cast(radius * radius); - std::vector counts(num_queries, 0); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = queries_row_splits[batch_idx + 1].Item(); const Tensor points_i = points.Slice(0, point_begin, point_end); - - for (int64_t q = query_begin; q < query_end; ++q) { - const Tensor query = queries.Slice(0, q, q + 1).Flatten(); - counts[q] = CountQueryResultsWithinRadiusSYCL( - points_i, query, threshold); + const 
Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + } } } + queue.wait_and_throw(); + + Tensor counts_cpu = counts.To(Device("CPU:0")); + const int64_t* counts_cpu_ptr = counts_cpu.GetDataPtr(); std::vector row_splits(num_queries + 1, 0); for (int64_t q = 0; q < num_queries; ++q) { - row_splits[q + 1] = row_splits[q] + counts[q]; + row_splits[q + 1] = row_splits[q] + counts_cpu_ptr[q]; } neighbors_row_splits = Tensor(row_splits, {num_queries + 1}, Int64).To(device); + std::vector row_offsets(row_splits.begin(), row_splits.end() - 1); + Tensor write_offsets = Tensor(row_offsets, {num_queries}, 
Int64).To(device); + NeighborSearchAllocator output_allocator(device); TIndex* neighbors_index_ptr; T* neighbors_distance_ptr; @@ -392,31 +691,64 @@ void FixedRadiusSearchSYCL(const Tensor& points, for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = queries_row_splits[batch_idx + 1].Item(); const Tensor points_i = points.Slice(0, point_begin, point_end); - - for (int64_t q = query_begin; q < query_end; ++q) { - if (counts[q] == 0) { - continue; + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* offsets_ptr = + write_offsets.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + 
temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + GatherWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, TIndex(point_begin + p), + offsets_ptr, neighbors_index_ptr, + return_distances ? neighbors_distance_ptr : nullptr); } - const Tensor query = queries.Slice(0, q, q + 1).Flatten(); - GatherQueryResultsWithinRadiusSYCL( - points_i, query, counts[q], TIndex(point_begin), - neighbors_index_ptr + row_splits[q], - return_distances ? neighbors_distance_ptr + row_splits[q] - : nullptr); } } + queue.wait_and_throw(); + neighbors_index = output_allocator.NeighborsIndex(); neighbors_distance = output_allocator.NeighborsDistance(); } +// Hybrid search counts all neighbors within radius while keeping only the best +// max_knn results per query in fixed-size output tensors. template void HybridSearchSYCL(const Tensor& points, const Tensor& queries, @@ -437,68 +769,161 @@ void HybridSearchSYCL(const Tensor& points, const Device device = points.GetDevice(); const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); const T threshold = static_cast(radius * radius); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); NeighborSearchAllocator output_allocator(device); TIndex* neighbors_index_ptr; T* neighbors_distance_ptr; - output_allocator.AllocIndices(&neighbors_index_ptr, - num_queries * max_knn, TIndex(-1)); + output_allocator.AllocIndices(&neighbors_index_ptr, num_queries * max_knn, + TIndex(-1)); output_allocator.AllocDistances(&neighbors_distance_ptr, - num_queries * max_knn, T(0)); + num_queries * max_knn, + std::numeric_limits::max()); + + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + Tensor out_indices = + output_allocator.NeighborsIndex().View({num_queries, max_knn}); + Tensor out_distances 
= + output_allocator.NeighborsDistance().View({num_queries, max_knn}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = Tensor::Empty({tile_queries, tile_points}, + Dtype::FromType(), device); + Tensor tile_top_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merged_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor merged_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty( + {tile_queries, 2 * max_knn}, Dtype::FromType(), device); - std::vector counts(num_queries, 0); for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = queries_row_splits[batch_idx + 1].Item(); const Tensor points_i = points.Slice(0, point_begin, point_end); - - for (int64_t q = query_begin; q < query_end; ++q) { - const Tensor query = queries.Slice(0, q, q + 1).Flatten(); - const int64_t num_neighbors = - CountQueryResultsWithinRadiusSYCL( - points_i, query, threshold); - counts[q] = static_cast(std::min(num_neighbors, - max_knn)); - if (counts[q] == 0) { - continue; + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = 
points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + Tensor best_indices_view = out_indices.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + Tensor best_distances_view = out_distances.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, max_knn, TIndex(point_begin + p), + 
tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), max_knn, true, + threshold); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), max_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), max_knn, + num_queries_iter, max_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), max_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; } - GatherQueryResultsWithinRadiusSYCL( - points_i, query, counts[q], TIndex(point_begin), - neighbors_index_ptr + q * max_knn, - neighbors_distance_ptr + q * max_knn); } } neighbors_index = output_allocator.NeighborsIndex(); neighbors_distance = output_allocator.NeighborsDistance(); neighbors_count = - Tensor(counts, {num_queries}, Dtype::FromType()).To(device); + Tensor::Empty({num_queries}, Dtype::FromType(), device); + FinalizeHybridResultsSYCL(device, counts.GetDataPtr(), + num_queries, max_knn, + neighbors_index.GetDataPtr(), + neighbors_distance.GetDataPtr(), + neighbors_count.GetDataPtr()); + queue.wait_and_throw(); } -#define INSTANTIATE(T, TIndex) \ - template void KnnSearchSYCL( \ - const Tensor& points, const Tensor& points_row_splits, \ - const Tensor& queries, const Tensor& queries_row_splits, int knn, \ - Tensor& neighbors_index, Tensor& neighbors_row_splits, \ - Tensor& neighbors_distance); \ - template void FixedRadiusSearchSYCL( \ - const Tensor& points, const Tensor& queries, double radius, \ +#define INSTANTIATE(T, TIndex) \ + template void KnnSearchSYCL( \ + const Tensor& points, const Tensor& points_row_splits, \ + const Tensor& queries, const Tensor& queries_row_splits, int knn, \ + Tensor& neighbors_index, Tensor& neighbors_row_splits, \ + Tensor& neighbors_distance); \ + template void 
FixedRadiusSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ const Tensor& points_row_splits, const Tensor& queries_row_splits, \ - const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ - const bool ignore_query_point, const bool return_distances, \ - const bool sort, Tensor& neighbors_index, \ - Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ - template void HybridSearchSYCL( \ - const Tensor& points, const Tensor& queries, double radius, \ - int max_knn, const Tensor& points_row_splits, \ - const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ - const Tensor&, const Metric metric, Tensor& neighbors_index, \ + const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ + const bool ignore_query_point, const bool return_distances, \ + const bool sort, Tensor& neighbors_index, \ + Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ + template void HybridSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + int max_knn, const Tensor& points_row_splits, \ + const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ + const Tensor&, const Metric metric, Tensor& neighbors_index, \ Tensor& neighbors_count, Tensor& neighbors_distance); INSTANTIATE(float, int32_t) diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index a1c4826d0fa..1d71b53455c 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -22,10 +22,15 @@ target_sources(tests PRIVATE SYCLUtils.cpp ) +if (BUILD_CUDA_MODULE OR BUILD_SYCL_MODULE) + target_sources(tests PRIVATE + KnnIndex.cpp + ) +endif() + if (BUILD_CUDA_MODULE) target_sources(tests PRIVATE FixedRadiusIndex.cpp - KnnIndex.cpp ParallelFor.cu ) endif() @@ -38,10 +43,8 @@ if (BUILD_ISPC_MODULE) endif() # TODO: cmake does not currently build this test! 
-# if (BUILD_SYCL_MODULE) -# target_sources(tests PRIVATE -# ParallelForSYCL.cpp -# ) -# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES -# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") -# endif() \ No newline at end of file +#if (BUILD_SYCL_MODULE) +# open3d_sycl_target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +#endif() \ No newline at end of file diff --git a/cpp/tests/core/KnnIndex.cpp b/cpp/tests/core/KnnIndex.cpp index 1ceb32d4835..fdeb8aa9c52 100644 --- a/cpp/tests/core/KnnIndex.cpp +++ b/cpp/tests/core/KnnIndex.cpp @@ -24,9 +24,17 @@ using namespace std; namespace open3d { namespace tests { -TEST(KnnIndex, KnnSearch) { +class KnnIndexPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P(KnnIndex, + KnnIndexPermuteDevices, + testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); + +TEST_P(KnnIndexPermuteDevices, KnnSearch) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -149,9 +157,12 @@ TEST(KnnIndex, KnnSearch) { EXPECT_TRUE(distances.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchHighdim) { +TEST_P(KnnIndexPermuteDevices, KnnSearchHighdim) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -231,9 +242,12 @@ TEST(KnnIndex, KnnSearchHighdim) { EXPECT_TRUE(distances64.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchBatch) { +TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { // Define test data. 
- core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init( {{0.719, 0.128, 0.431}, {0.764, 0.970, 0.678}, {0.692, 0.786, 0.211}, {0.692, 0.969, 0.942}, @@ -303,8 +317,8 @@ TEST(KnnIndex, KnnSearchBatch) { index32.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); // int64 // Set up Knn index. @@ -327,8 +341,8 @@ TEST(KnnIndex, KnnSearchBatch) { index64.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); } } // namespace tests From 058c09f7a6a4e39b7a70ecfbe33714d6d62ca418 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Mar 2026 06:33:45 +0000 Subject: [PATCH 13/14] Apply automatic code formatting --- cpp/open3d/core/nns/KnnIndex.h | 8 ++++---- cpp/open3d/core/nns/NearestNeighborSearch.cpp | 13 ++++++------- cpp/tests/core/KnnIndex.cpp | 13 ++++++++----- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cpp/open3d/core/nns/KnnIndex.h b/cpp/open3d/core/nns/KnnIndex.h index 6628c92850c..4ac3b7193f8 100644 --- a/cpp/open3d/core/nns/KnnIndex.h +++ b/cpp/open3d/core/nns/KnnIndex.h @@ -47,10 +47,10 @@ class KnnIndex : public NNSIndex { /// \brief Parameterized Constructor. 
/// - /// \param dataset_points Provides a set of data points as Tensor for nearest - /// neighbor search. CPU tensors use NanoFlann through - /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors are - /// handled by this class. + /// \param dataset_points Provides a set of data points as Tensor for + /// nearest neighbor search. CPU tensors use NanoFlann through + /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors + /// are handled by this class. KnnIndex(const Tensor& dataset_points); KnnIndex(const Tensor& dataset_points, const Dtype& index_dtype); ~KnnIndex(); diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.cpp b/cpp/open3d/core/nns/NearestNeighborSearch.cpp index 920971beaed..f7889f737bb 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.cpp +++ b/cpp/open3d/core/nns/NearestNeighborSearch.cpp @@ -52,12 +52,12 @@ bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { } else { if (dataset_points_.IsSYCL()) { if (!radius.has_value()) { - utility::LogError("radius is required for SYCL FixedRadiusIndex."); + utility::LogError( + "radius is required for SYCL FixedRadiusIndex."); } fixed_radius_index_.reset(new nns::FixedRadiusIndex()); - return fixed_radius_index_->SetTensorData(dataset_points_, - radius.value(), - index_dtype_); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); } return SetIndex(); } @@ -83,9 +83,8 @@ bool NearestNeighborSearch::HybridIndex(std::optional radius) { utility::LogError("radius is required for SYCL HybridIndex."); } fixed_radius_index_.reset(new nns::FixedRadiusIndex()); - return fixed_radius_index_->SetTensorData(dataset_points_, - radius.value(), - index_dtype_); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); } return SetIndex(); } diff --git a/cpp/tests/core/KnnIndex.cpp b/cpp/tests/core/KnnIndex.cpp index fdeb8aa9c52..a5d63beebd1 100644 --- a/cpp/tests/core/KnnIndex.cpp +++ 
b/cpp/tests/core/KnnIndex.cpp @@ -25,9 +25,10 @@ namespace open3d { namespace tests { class KnnIndexPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(KnnIndex, - KnnIndexPermuteDevices, - testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); +INSTANTIATE_TEST_SUITE_P( + KnnIndex, + KnnIndexPermuteDevices, + testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); TEST_P(KnnIndexPermuteDevices, KnnSearch) { // Define test data. @@ -318,7 +319,8 @@ TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); // int64 // Set up Knn index. @@ -342,7 +344,8 @@ TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); } } // namespace tests From 283039b52a0d8e954e4cc87c807b55da23b667a0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 23:27:54 +0000 Subject: [PATCH 14/14] Add SYCL support for Transform kernel: TransformPoints, TransformNormals, RotatePoints, RotateNormals Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/geometry/kernel/CMakeLists.txt | 6 + cpp/open3d/t/geometry/kernel/Transform.cpp | 24 ++++ cpp/open3d/t/geometry/kernel/Transform.h | 14 +++ cpp/open3d/t/geometry/kernel/TransformImpl.h | 4 + .../t/geometry/kernel/TransformSYCL.cpp | 104 ++++++++++++++++++ 5 files changed, 152 insertions(+) create mode 100644 
cpp/open3d/t/geometry/kernel/TransformSYCL.cpp diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index 223c4d4c7a8..560745b6b5d 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -30,6 +30,12 @@ if (BUILD_CUDA_MODULE) ) endif() +if (BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tgeometry_kernel PRIVATE + TransformSYCL.cpp + ) +endif() + if (WITH_IPP) target_sources(tgeometry_kernel PRIVATE IPPImage.cpp diff --git a/cpp/open3d/t/geometry/kernel/Transform.cpp b/cpp/open3d/t/geometry/kernel/Transform.cpp index 06ec1de454f..185be29bfc2 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.cpp +++ b/cpp/open3d/t/geometry/kernel/Transform.cpp @@ -31,6 +31,12 @@ void TransformPoints(const core::Tensor& transformation, core::Tensor& points) { } else if (points.IsCUDA()) { CUDA_CALL(TransformPointsCUDA, transformation_contiguous, points_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformPointsSYCL(transformation_contiguous, points_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -53,6 +59,12 @@ void TransformNormals(const core::Tensor& transformation, } else if (normals.IsCUDA()) { CUDA_CALL(TransformNormalsCUDA, transformation_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformNormalsSYCL(transformation_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -78,6 +90,12 @@ void RotatePoints(const core::Tensor& R, } else if (points.IsCUDA()) { CUDA_CALL(RotatePointsCUDA, R_contiguous, points_contiguous, center_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotatePointsSYCL(R_contiguous, points_contiguous, center_contiguous); 
+#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -97,6 +115,12 @@ void RotateNormals(const core::Tensor& R, core::Tensor& normals) { RotateNormalsCPU(R_contiguous, normals_contiguous); } else if (normals.IsCUDA()) { CUDA_CALL(RotateNormalsCUDA, R_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotateNormalsSYCL(R_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } diff --git a/cpp/open3d/t/geometry/kernel/Transform.h b/cpp/open3d/t/geometry/kernel/Transform.h index d64d860572e..e51ebae73c4 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.h +++ b/cpp/open3d/t/geometry/kernel/Transform.h @@ -52,6 +52,20 @@ void RotatePointsCUDA(const core::Tensor& R, void RotateNormalsCUDA(const core::Tensor& R, core::Tensor& normals); #endif +#ifdef BUILD_SYCL_MODULE +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points); + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals); + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center); + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals); +#endif + } // namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformImpl.h b/cpp/open3d/t/geometry/kernel/TransformImpl.h index 14c08a01e52..cedcc22fd87 100644 --- a/cpp/open3d/t/geometry/kernel/TransformImpl.h +++ b/cpp/open3d/t/geometry/kernel/TransformImpl.h @@ -87,6 +87,8 @@ OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void RotateNormalsKernel( normals_ptr[2] = x[2]; } +#ifndef OPEN3D_SKIP_TRANSFORM_MAIN + #ifdef __CUDACC__ void TransformPointsCUDA #else @@ -167,6 +169,8 @@ void RotateNormalsCPU }); } +#endif // OPEN3D_SKIP_TRANSFORM_MAIN + } // 
namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp new file mode 100644 index 00000000000..9bfd9078e3a --- /dev/null +++ b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp @@ -0,0 +1,104 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates. +#define OPEN3D_SKIP_TRANSFORM_MAIN +#include "open3d/t/geometry/kernel/TransformImpl.h" +#undef OPEN3D_SKIP_TRANSFORM_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace geometry { +namespace kernel { +namespace transform { + +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* transformation_ptr = + transformation.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformPointsKernel( + transformation_ptr, + points_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* transformation_ptr = + 
transformation.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformNormalsKernel( + transformation_ptr, + normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const scalar_t* center_ptr = center.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotatePointsKernel(R_ptr, + points_ptr + 3 * id[0], + center_ptr); + }) + .wait_and_throw(); + }); +} + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotateNormalsKernel(R_ptr, normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +} // namespace transform +} // namespace kernel +} // namespace geometry +} // namespace t +} // namespace open3d