From f965404b4f0f0b0b3535c141e02ff78c4117b14b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 07:49:39 +0000 Subject: [PATCH 01/14] Initial plan From a5f1fa3dc756e1ab2b15234c0a5beb7caa5428b7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:01:06 +0000 Subject: [PATCH 02/14] Add SYCL kernels for t/pipelines/kernel: Feature, FillInLinearSystem, Registration, RGBDOdometry, TransformationConverter Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/pipelines/kernel/CMakeLists.txt | 10 + cpp/open3d/t/pipelines/kernel/Feature.cpp | 11 +- cpp/open3d/t/pipelines/kernel/Feature.h | 13 + cpp/open3d/t/pipelines/kernel/FeatureImpl.h | 2 + cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp | 228 +++++++++ .../t/pipelines/kernel/FillInLinearSystem.cpp | 26 + .../t/pipelines/kernel/FillInLinearSystem.h | 41 ++ .../pipelines/kernel/FillInLinearSystemImpl.h | 2 + .../kernel/FillInLinearSystemSYCL.cpp | 456 +++++++++++++++++ .../t/pipelines/kernel/RGBDOdometry.cpp | 39 ++ .../t/pipelines/kernel/RGBDOdometryImpl.h | 55 +++ .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 366 ++++++++++++++ .../t/pipelines/kernel/Registration.cpp | 91 ++++ .../t/pipelines/kernel/RegistrationImpl.h | 57 +++ .../t/pipelines/kernel/RegistrationSYCL.cpp | 462 ++++++++++++++++++ .../kernel/TransformationConverter.cpp | 14 + .../kernel/TransformationConverterImpl.h | 20 + .../kernel/TransformationConverterSYCL.cpp | 56 +++ 18 files changed, 1948 insertions(+), 1 deletion(-) create mode 100644 cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp create mode 100644 cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp 
diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index a766715e6e7..2a9478b4be9 100644 --- a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -22,6 +22,16 @@ if (BUILD_CUDA_MODULE) ) endif() +if (BUILD_SYCL_MODULE) + target_sources(tpipelines_kernel PRIVATE + RegistrationSYCL.cpp + FillInLinearSystemSYCL.cpp + RGBDOdometrySYCL.cpp + TransformationConverterSYCL.cpp + FeatureSYCL.cpp + ) +endif() + open3d_show_and_abort_on_warning(tpipelines_kernel) open3d_set_global_properties(tpipelines_kernel) # The kernels are used in the unit tests, so they cannot be hidden for now. diff --git a/cpp/open3d/t/pipelines/kernel/Feature.cpp b/cpp/open3d/t/pipelines/kernel/Feature.cpp index c73573054aa..4677396f1d7 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.cpp +++ b/cpp/open3d/t/pipelines/kernel/Feature.cpp @@ -44,10 +44,19 @@ void ComputeFPFHFeature( if (points_d.IsCPU()) { ComputeFPFHFeatureCPU(points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); - } else { + } else if (points_d.IsCUDA()) { core::CUDAScopedDevice scoped_device(points.GetDevice()); CUDA_CALL(ComputeFPFHFeatureCUDA, points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); + } else if (points_d.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeFPFHFeatureSYCL(points_d, normals_d, indices, distance2, + counts_d, fpfhs, mask, map_info_idx_to_point_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif + } else { + utility::LogError("Unimplemented device."); } utility::LogDebug( "[ComputeFPFHFeature] Computed {:d} features from " diff --git a/cpp/open3d/t/pipelines/kernel/Feature.h b/cpp/open3d/t/pipelines/kernel/Feature.h index 6645c9bb6cc..cc1d66f6920 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.h +++ b/cpp/open3d/t/pipelines/kernel/Feature.h @@ -49,6 +49,19 @@ void ComputeFPFHFeatureCUDA( 
std::nullopt); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional<core::Tensor> &mask = std::nullopt, + const std::optional<core::Tensor> &map_batch_info_idx_to_point_idx = + std::nullopt); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h index c9948b6a62d..5caef81ff49 100644 --- a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h @@ -104,6 +104,7 @@ OPEN3D_HOST_DEVICE void UpdateSPFHFeature(const scalar_t *feature, spfh[idx * 33 + h_index3 + 22] += hist_incr; } +#ifndef OPEN3D_SKIP_FPFH_MAIN #if defined(__CUDACC__) void ComputeFPFHFeatureCUDA #else @@ -291,6 +292,7 @@ void ComputeFPFHFeatureCPU }); }); } +#endif // OPEN3D_SKIP_FPFH_MAIN } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp new file mode 100644 index 00000000000..7338edc63f8 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp @@ -0,0 +1,228 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates.
+#define OPEN3D_SKIP_FPFH_MAIN +#include "open3d/t/pipelines/kernel/FeatureImpl.h" +#undef OPEN3D_SKIP_FPFH_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional<core::Tensor> &mask, + const std::optional<core::Tensor> &map_info_idx_to_point_idx) { + const core::Dtype dtype = points.GetDtype(); + const core::Device device = points.GetDevice(); + const int64_t n_points = points.GetLength(); + + const bool filter_fpfh = + mask.has_value() && map_info_idx_to_point_idx.has_value(); + if (mask.has_value() ^ map_info_idx_to_point_idx.has_value()) { + utility::LogError( + "Parameters mask and map_info_idx_to_point_idx must " + "either be both provided or both not provided."); + } + if (filter_fpfh) { + if (mask.value().GetShape()[0] != n_points) { + utility::LogError( + "Parameter mask was provided, but its size {:d} should" + "be equal to the number of points {:d}.", + (int)mask.value().GetShape()[0], n_points); + } + if (map_info_idx_to_point_idx.value().GetShape()[0] != + counts.GetShape()[0] - (indices.GetShape().size() == 1 ? 1 : 0)) { + utility::LogError( + "Parameter map_info_idx_to_point_idx was provided, " + "but its size" + "{:d} should be equal to the size of counts {:d}.", + (int)map_info_idx_to_point_idx.value().GetShape()[0], + (int)counts.GetShape()[0]); + } + } + + core::Tensor map_spfh_info_idx_to_point_idx = + map_info_idx_to_point_idx.value_or( + core::Tensor::Empty({0}, core::Int64, device)); + + const core::Tensor map_fpfh_idx_to_point_idx = + filter_fpfh ? mask.value().NonZero().GetItem( + {core::TensorKey::Index(0)}) + : core::Tensor::Empty({0}, core::Int64, device); + + const int32_t n_fpfh = + filter_fpfh ?
map_fpfh_idx_to_point_idx.GetLength() : n_points; + const int32_t n_spfh = + filter_fpfh ? map_spfh_info_idx_to_point_idx.GetLength() : n_points; + + core::Tensor spfhs = + core::Tensor::Zeros({n_spfh, 33}, dtype, fpfhs.GetDevice()); + + core::Tensor map_point_idx_to_spfh_idx; + if (filter_fpfh) { + map_point_idx_to_spfh_idx = core::Tensor::Full( + {n_points}, -1, core::Int64, fpfhs.GetDevice()); + map_point_idx_to_spfh_idx.IndexSet( + {map_spfh_info_idx_to_point_idx}, + core::Tensor::Arange(0, n_spfh, 1, core::Int64, + fpfhs.GetDevice())); + } else { + map_point_idx_to_spfh_idx = + core::Tensor::Empty({0}, core::Int64, fpfhs.GetDevice()); + } + + // Check the nns type (knn = hybrid = false, radius = true). + // The nns radius search mode will resulting a prefix sum 1D tensor. + bool is_radius_search; + int nn_size = 0; + if (indices.GetShape().size() == 1) { + is_radius_search = true; + } else { + is_radius_search = false; + nn_size = indices.GetShape()[1]; + } + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t *points_ptr = points.GetDataPtr<scalar_t>(); + const scalar_t *normals_ptr = normals.GetDataPtr<scalar_t>(); + const int32_t *indices_ptr = indices.GetDataPtr<int32_t>(); + const scalar_t *distance2_ptr = distance2.GetDataPtr<scalar_t>(); + const int32_t *counts_ptr = counts.GetDataPtr<int32_t>(); + scalar_t *spfhs_ptr = spfhs.GetDataPtr<scalar_t>(); + scalar_t *fpfhs_ptr = fpfhs.GetDataPtr<scalar_t>(); + const int64_t *map_spfh_info_idx_to_point_idx_ptr = + map_spfh_info_idx_to_point_idx.GetDataPtr<int64_t>(); + const int64_t *map_fpfh_idx_to_point_idx_ptr = + map_fpfh_idx_to_point_idx.GetDataPtr<int64_t>(); + const int64_t *map_point_idx_to_spfh_idx_ptr = + map_point_idx_to_spfh_idx.GetDataPtr<int64_t>(); + + // Compute SPFH features for the points. + queue.parallel_for( + sycl::range<1>{(size_t)n_spfh}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_point_idx = + filter_fpfh + ?
map_spfh_info_idx_to_point_idx_ptr + [workload_idx] + : workload_idx; + int64_t idx = 3 * workload_point_idx; + const scalar_t *point = points_ptr + idx; + const scalar_t *normal = normals_ptr + idx; + + const int indice_size = + is_radius_search + ? (counts_ptr[workload_idx + 1] - + counts_ptr[workload_idx]) + : counts_ptr[workload_idx]; + + if (indice_size > 1) { + const scalar_t hist_incr = + 100.0 / static_cast( + indice_size - 1); + for (int i = 1; i < indice_size; i++) { + const int point_idx = + is_radius_search + ? indices_ptr + [i + + counts_ptr + [workload_idx]] + : indices_ptr[workload_idx * + nn_size + + i]; + + const scalar_t *point_ref = + points_ptr + 3 * point_idx; + const scalar_t *normal_ref = + normals_ptr + 3 * point_idx; + scalar_t fea[4] = {0}; + ComputePairFeature( + point, normal, point_ref, normal_ref, + fea); + UpdateSPFHFeature( + fea, workload_idx, hist_incr, + spfhs_ptr); + } + } + }) + .wait_and_throw(); + + // Compute FPFH features for the points. + queue.parallel_for( + sycl::range<1>{(size_t)n_fpfh}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_spfh_idx = + filter_fpfh + ? map_point_idx_to_spfh_idx_ptr + [map_fpfh_idx_to_point_idx_ptr + [workload_idx]] + : workload_idx; + const int indice_size = + is_radius_search + ? (counts_ptr[workload_spfh_idx + 1] - + counts_ptr[workload_spfh_idx]) + : counts_ptr[workload_spfh_idx]; + if (indice_size > 1) { + scalar_t sum[3] = {0.0, 0.0, 0.0}; + for (int i = 1; i < indice_size; i++) { + const int idx = + is_radius_search + ? i + + counts_ptr + [workload_spfh_idx] + : workload_spfh_idx * nn_size + + i; + const scalar_t dist = distance2_ptr[idx]; + if (dist == 0.0) continue; + const int32_t spfh_idx = + filter_fpfh + ? 
map_point_idx_to_spfh_idx_ptr + [indices_ptr[idx]] + : indices_ptr[idx]; + for (int j = 0; j < 33; j++) { + const scalar_t val = + spfhs_ptr[spfh_idx * 33 + j] / + dist; + sum[j / 11] += val; + fpfhs_ptr[workload_idx * 33 + j] += val; + } + } + for (int j = 0; j < 3; j++) { + sum[j] = sum[j] != 0.0 ? 100.0 / sum[j] : 0.0; + } + for (int j = 0; j < 33; j++) { + fpfhs_ptr[workload_idx * 33 + j] *= + sum[j / 11]; + fpfhs_ptr[workload_idx * 33 + j] += + spfhs_ptr[workload_spfh_idx * 33 + j]; + } + } + }) + .wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp index 9d5e825da9a..82a2c33ac93 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp @@ -60,6 +60,13 @@ void FillInRigidAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInRigidAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, + Ri_normal_ps, i, j, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -124,6 +131,16 @@ void FillInSLACAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, + normal_ps, Ri_normal_ps, + RjT_Ri_normal_ps, cgrid_idx_ps, + cgrid_idx_qs, cgrid_ratio_ps, + cgrid_ratio_qs, i, j, n, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -163,6 +180,15 @@ void FillInSLACRegularizerTerm(core::Tensor &AtA, positions_init, positions_curr, 
weight, n, anchor_idx); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACRegularizerTermSYCL(AtA, Atb, residual, grid_idx, + grid_nbs_idx, grid_nbs_mask, + positions_init, positions_curr, weight, n, + anchor_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h index 1c529813841..91014b84dd2 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h @@ -130,6 +130,47 @@ void FillInSLACRegularizerTermCUDA(core::Tensor &AtA, int n, int anchor_idx); +#endif +#ifdef BUILD_SYCL_MODULE +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold); + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &normal_ps, + const core::Tensor &Ri_normal_ps, + const core::Tensor &RjT_Ri_normal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n, + float threshold); + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n, + int anchor_idx); + #endif } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h 
b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h index e158809f6a4..21732102836 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h @@ -13,6 +13,7 @@ namespace open3d { namespace t { namespace pipelines { namespace kernel { +#ifndef OPEN3D_SKIP_FILL_IN_LS_MAIN #if defined(__CUDACC__) void FillInRigidAlignmentTermCUDA #else @@ -471,6 +472,7 @@ void FillInSLACRegularizerTermCPU } }); } +#endif // OPEN3D_SKIP_FILL_IN_LS_MAIN } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp new file mode 100644 index 00000000000..e99b7f5ccb6 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -0,0 +1,456 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; include only helper includes. 
+#define OPEN3D_SKIP_FILL_IN_LS_MAIN +#include "open3d/t/pipelines/kernel/FillInLinearSystemImpl.h" +#undef OPEN3D_SKIP_FILL_IN_LS_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/FillInLinearSystem.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_ps, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold) { + core::Device device = AtA.GetDevice(); + int64_t n = Ti_ps.GetLength(); + if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) { + utility::LogError( + "Unable to setup linear system: input length mismatch."); + } + + // First fill in a small 12 x 12 linear system. + core::Tensor AtA_local = + core::Tensor::Zeros({12, 12}, core::Float32, device); + core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device); + + float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr()); + float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr()); + float *residual_ptr = static_cast<float *>(residual.GetDataPtr()); + + const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr()); + const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr()); + const float *Ri_normal_ps_ptr = + static_cast<const float *>(Ri_normal_ps.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *p_prime = Ti_ps_ptr + 3 * workload_idx; + const float *q_prime = Tj_qs_ptr + 3 * workload_idx; + const float *normal_p_prime = + Ri_normal_ps_ptr + 3 * workload_idx; + + float r = (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * normal_p_prime[2]; + if (sycl::fabs(r)
> threshold) return; + + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; ++i_local) { + for (int j_local = 0; j_local < 12; ++j_local) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_local_ptr[i_local * 12 + j_local]) += + J_ij[i_local] * J_ij[j_local]; + } + sycl::atomic_ref( + Atb_local_ptr[i_local]) += + J_ij[i_local] * r; + } + sycl::atomic_ref( + *residual_ptr) += r * r; + }) + .wait_and_throw(); + + // Then fill-in the large linear system. + std::vector indices_vec(12); + for (int k = 0; k < 6; ++k) { + indices_vec[k] = i * 6 + k; + indices_vec[k + 6] = j * 6 + k; + } + + std::vector indices_i_vec; + std::vector indices_j_vec; + for (int local_i = 0; local_i < 12; ++local_i) { + for (int local_j = 0; local_j < 12; ++local_j) { + indices_i_vec.push_back(indices_vec[local_i]); + indices_j_vec.push_back(indices_vec[local_j]); + } + } + + core::Tensor indices(indices_vec, {12}, core::Int64, device); + core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device); + core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device); + + core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j}); + AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12})); + + core::Tensor Atb_sub = Atb.IndexGet({indices}); + Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1})); +} + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_Cps, + const core::Tensor &Tj_Cqs, + const core::Tensor &Cnormal_ps, + const core::Tensor 
&Ri_Cnormal_ps, + const core::Tensor &RjT_Ri_Cnormal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n_frags, + float threshold) { + int64_t n = Ti_Cps.GetLength(); + if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n || + Ri_Cnormal_ps.GetLength() != n || + RjT_Ri_Cnormal_ps.GetLength() != n || + cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n || + cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) { + utility::LogError( + "Unable to setup linear system: input length mismatch."); + } + + core::Device device = AtA.GetDevice(); + int n_vars = Atb.GetLength(); + float *AtA_ptr = static_cast(AtA.GetDataPtr()); + float *Atb_ptr = static_cast(Atb.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const float *Ti_Cps_ptr = + static_cast(Ti_Cps.GetDataPtr()); + const float *Tj_Cqs_ptr = + static_cast(Tj_Cqs.GetDataPtr()); + const float *Cnormal_ps_ptr = + static_cast(Cnormal_ps.GetDataPtr()); + const float *Ri_Cnormal_ps_ptr = + static_cast(Ri_Cnormal_ps.GetDataPtr()); + const float *RjT_Ri_Cnormal_ps_ptr = + static_cast(RjT_Ri_Cnormal_ps.GetDataPtr()); + + const int *cgrid_idx_ps_ptr = + static_cast(cgrid_idx_ps.GetDataPtr()); + const int *cgrid_idx_qs_ptr = + static_cast(cgrid_idx_qs.GetDataPtr()); + const float *cgrid_ratio_ps_ptr = + static_cast(cgrid_ratio_ps.GetDataPtr()); + const float *cgrid_ratio_qs_ptr = + static_cast(cgrid_ratio_qs.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx; + const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx; + const float *Cnormal_p = + Cnormal_ps_ptr + 3 * workload_idx; + const float *Ri_Cnormal_p = + Ri_Cnormal_ps_ptr + 3 * 
workload_idx; + const float *RjTRi_Cnormal_p = + RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx; + + const int *cgrid_idx_p = + cgrid_idx_ps_ptr + 8 * workload_idx; + const int *cgrid_idx_q = + cgrid_idx_qs_ptr + 8 * workload_idx; + const float *cgrid_ratio_p = + cgrid_ratio_ps_ptr + 8 * workload_idx; + const float *cgrid_ratio_q = + cgrid_ratio_qs_ptr + 8 * workload_idx; + + float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + + (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + + (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; + if (sycl::fabs(r) > threshold) return; + + float J[60]; + int idx[60]; + + J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + + Tj_Cq[1] * Ri_Cnormal_p[2]; + J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - + Tj_Cq[0] * Ri_Cnormal_p[2]; + J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + + Tj_Cq[0] * Ri_Cnormal_p[1]; + J[3] = Ri_Cnormal_p[0]; + J[4] = Ri_Cnormal_p[1]; + J[5] = Ri_Cnormal_p[2]; + + for (int k = 0; k < 6; ++k) { + J[k + 6] = -J[k]; + idx[k + 0] = 6 * i + k; + idx[k + 6] = 6 * j + k; + } + + for (int k = 0; k < 8; ++k) { + J[12 + k * 3 + 0] = + cgrid_ratio_p[k] * Cnormal_p[0]; + J[12 + k * 3 + 1] = + cgrid_ratio_p[k] * Cnormal_p[1]; + J[12 + k * 3 + 2] = + cgrid_ratio_p[k] * Cnormal_p[2]; + idx[12 + k * 3 + 0] = + 6 * n_frags + cgrid_idx_p[k] * 3 + 0; + idx[12 + k * 3 + 1] = + 6 * n_frags + cgrid_idx_p[k] * 3 + 1; + idx[12 + k * 3 + 2] = + 6 * n_frags + cgrid_idx_p[k] * 3 + 2; + } + + for (int k = 0; k < 8; ++k) { + J[36 + k * 3 + 0] = + -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; + J[36 + k * 3 + 1] = + -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; + J[36 + k * 3 + 2] = + -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; + idx[36 + k * 3 + 0] = + 6 * n_frags + cgrid_idx_q[k] * 3 + 0; + idx[36 + k * 3 + 1] = + 6 * n_frags + cgrid_idx_q[k] * 3 + 1; + idx[36 + k * 3 + 2] = + 6 * n_frags + cgrid_idx_q[k] * 3 + 2; + } + + for (int ki = 0; ki < 60; ++ki) { + for (int kj = 0; kj < 60; ++kj) { + float AtA_ij = J[ki] * J[kj]; + int ij = idx[ki] * n_vars + idx[kj]; + sycl::atomic_ref< + float, 
sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[ij]) += AtA_ij; + } + sycl::atomic_ref( + Atb_ptr[idx[ki]]) += J[ki] * r; + } + sycl::atomic_ref( + *residual_ptr) += r * r; + }) + .wait_and_throw(); +} + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n_frags, + int anchor_idx) { + int64_t n = grid_idx.GetLength(); + int64_t n_vars = Atb.GetLength(); + + core::Device device = AtA.GetDevice(); + float *AtA_ptr = static_cast(AtA.GetDataPtr()); + float *Atb_ptr = static_cast(Atb.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const int *grid_idx_ptr = + static_cast(grid_idx.GetDataPtr()); + const int *grid_nbs_idx_ptr = + static_cast(grid_nbs_idx.GetDataPtr()); + const bool *grid_nbs_mask_ptr = + static_cast(grid_nbs_mask.GetDataPtr()); + const float *positions_init_ptr = + static_cast(positions_init.GetDataPtr()); + const float *positions_curr_ptr = + static_cast(positions_curr.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int idx_i = grid_idx_ptr[workload_idx]; + + const int *idx_nbs = + grid_nbs_idx_ptr + 6 * workload_idx; + const bool *mask_nbs = + grid_nbs_mask_ptr + 6 * workload_idx; + + float cov[3][3] = {{0}}; + float U[3][3], V[3][3], S[3]; + + int cnt = 0; + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 
2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + + for (int ii = 0; ii < 3; ++ii) { + for (int jj = 0; jj < 3; ++jj) { + cov[ii][jj] += diff_ik_init[ii] * + diff_ik_curr[jj]; + } + } + ++cnt; + } + + if (cnt < 3) return; + + core::linalg::kernel::svd3x3(*cov, *U, S, *V); + + float R[3][3]; + core::linalg::kernel::transpose3x3_(*U); + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + + float d = core::linalg::kernel::det3x3(*R); + if (d < 0) { + U[2][0] = -U[2][0]; + U[2][1] = -U[2][1]; + U[2][2] = -U[2][2]; + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + } + + if (idx_i == anchor_idx) { + R[0][0] = R[1][1] = R[2][2] = 1; + R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = + R[2][1] = 0; + } + + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + float R_diff_ik_curr[3]; + core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, + R_diff_ik_curr); + + float local_r[3]; + local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; + local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; + local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; + + int offset_idx_i = 3 * idx_i + 6 * n_frags; + int offset_idx_k = 3 * idx_k + 6 * n_frags; + + sycl::atomic_ref( + *residual_ptr) += + weight * (local_r[0] * local_r[0] + + local_r[1] * local_r[1] + 
+ local_r[2] * local_r[2]); + + for (int axis = 0; axis < 3; ++axis) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_i + axis]) += weight; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_k + axis]) += weight; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_k + axis]) -= weight; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_i + axis]) -= weight; + + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + Atb_ptr[offset_idx_i + axis]) += + weight * local_r[axis]; + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + Atb_ptr[offset_idx_k + axis]) -= + weight * local_r[axis]; + } + } + }) + .wait_and_throw(); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp index ac61daeaa2c..6b54fb41961 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp @@ -58,6 +58,15 @@ void ComputeOdometryResultPointToPlane( target_vertex_map, target_normal_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultPointToPlaneSYCL( + source_vertex_map, target_vertex_map, target_normal_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, depth_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { 
utility::LogError("Unimplemented device."); } @@ -118,6 +127,16 @@ void ComputeOdometryResultIntensity(const core::Tensor &source_depth, target_intensity_dx, target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultIntensitySYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_intensity_dx, target_intensity_dy, source_vertex_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -187,6 +206,17 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultHybridSYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_depth_dx, target_depth_dy, target_intensity_dx, + target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, + delta, inlier_residual, inlier_count, depth_outlier_trunc, + depth_huber_delta, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -222,6 +252,15 @@ void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map, CUDA_CALL(ComputeOdometryInformationMatrixCUDA, source_vertex_map, target_vertex_map, intrinsic, source_to_target, square_dist_thr, information); + } else if (device.GetType() == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryInformationMatrixSYCL(source_vertex_map, + target_vertex_map, intrinsic, + source_to_target, 
square_dist_thr, + information); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index d1aacbbeac3..ff854a406e2 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -125,6 +125,61 @@ void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_depth, core::Tensor& information); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeOdometryResultPointToPlaneSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& source_normal_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta); + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta); + +void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, 
+ core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta); + +void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information); +#endif + } // namespace odometry } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp new file mode 100644 index 00000000000..f48468763f1 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -0,0 +1,366 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/geometry/kernel/GeometryIndexer.h" +#include "open3d/t/geometry/kernel/GeometryMacros.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryImpl.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { +namespace odometry { + +using t::geometry::kernel::NDArrayIndexer; +using t::geometry::kernel::TransformIndexer; + +static constexpr int kReduceDimOdometry = 29; +static constexpr int kJtJDimOdometry = 21; + +void ComputeOdometryResultPointToPlaneSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& target_normal_map, + const core::Tensor& intrinsics, + const core::Tensor& 
init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta) { + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); + NDArrayIndexer target_normal_indexer(target_normal_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J[6] = {0}; + float r = 0; + const bool valid = GetJacobianPointToPlane( + x, y, depth_outlier_trunc, source_vertex_indexer, + target_vertex_indexer, target_normal_indexer, ti, + J, r); + + if (valid) { + const float d_huber = + HuberDeriv(r, depth_huber_delta); + const float r_huber = + HuberLoss(r, depth_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J[i] * J[j]; + } + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[21 + i]) += + J[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }) + .wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, 
inlier_count); +} + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, 
source_vertex_indexer, + ti, J_I, r_I); + + if (valid) { + const float d_huber = + HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = + HuberLoss(r_I, intensity_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J_I[i] * J_I[j]; + } + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[21 + i]) += + J_I[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }) + .wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryResultHybridSYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_depth_dx_indexer(target_depth_dx, 2); + NDArrayIndexer target_depth_dy_indexer(target_depth_dy, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer 
source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, target_depth_dx_indexer, + target_depth_dy_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, source_vertex_indexer, + ti, J_I, J_D, r_I, r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J_I[i] * J_I[j] + J_D[i] * J_D[j]; + } + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[21 + i]) += + J_I[i] * + HuberDeriv(r_I, + intensity_huber_delta) + + J_D[i] * HuberDeriv(r_D, depth_huber_delta); + } + sycl::atomic_ref( + global_sum_ptr[27]) += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }) + .wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void 
ComputeOdometryInformationMatrixSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information) { + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = source_to_target; + TransformIndexer ti(intrinsic, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kJtJDimOdometry}, core::Float32, device); + float* global_sum_ptr = global_sum.GetDataPtr(); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, + rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[offset++]) += + J_x[i] * J_x[j] + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; + } + } + } + }) + .wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + information = core::Tensor::Empty({6, 6}, core::Float64, host); + global_sum = global_sum.To(host, core::Float64); + + double* info_ptr = information.GetDataPtr(); + double* reduction_ptr = global_sum.GetDataPtr(); + for (int j = 0; j < 6; j++) { + const int64_t reduction_idx = ((j * (j + 1)) / 2); + for 
(int k = 0; k <= j; k++) { + info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k]; + info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k]; + } + } +} + +} // namespace odometry +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/Registration.cpp b/cpp/open3d/t/pipelines/kernel/Registration.cpp index 326671431f3..b3ebf4f5508 100644 --- a/cpp/open3d/t/pipelines/kernel/Registration.cpp +++ b/cpp/open3d/t/pipelines/kernel/Registration.cpp @@ -41,6 +41,16 @@ core::Tensor ComputePosePointToPlane(const core::Tensor &source_points, target_points.Contiguous(), target_normals.Contiguous(), correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePosePointToPlaneSYCL( + source_points.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), pose, residual, + inlier_count, source_points.GetDtype(), device, kernel); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -85,6 +95,18 @@ core::Tensor ComputePoseColoredICP(const core::Tensor &source_points, correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel, lambda_geometric); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePoseColoredICPSYCL( + source_points.Contiguous(), source_colors.Contiguous(), + target_points.Contiguous(), target_normals.Contiguous(), + target_colors.Contiguous(), target_color_gradients.Contiguous(), + correspondence_indices.Contiguous(), pose, residual, + inlier_count, source_points.GetDtype(), device, kernel, + lambda_geometric); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ 
-191,6 +213,21 @@ core::Tensor ComputePoseDopplerICP( v_v_in_V.Contiguous(), period, reject_outliers, doppler_outlier_threshold, kernel_geometric, kernel_doppler, lambda_doppler); + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputePoseDopplerICPSYCL( + source_points.Contiguous(), source_dopplers.Contiguous(), + source_directions.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), output_pose, residual, + inlier_count, dtype, device, R_S_to_V.Contiguous(), + r_v_to_s_in_V.Contiguous(), w_v_in_V.Contiguous(), + v_v_in_V.Contiguous(), period, reject_outliers, + doppler_outlier_threshold, kernel_geometric, kernel_doppler, + lambda_doppler); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -270,6 +307,51 @@ std::tuple ComputeRtPointToPoint( t = mean_t.Reshape({-1}) - R.Matmul(mean_s.T()).Reshape({-1}); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + // Use tensor operations which work on SYCL devices. 
+ core::Tensor valid = correspondence_indices.Ne(-1).Reshape({-1}); + + if (valid.GetLength() == 0) { + utility::LogError("No valid correspondence present."); + } + + core::Tensor source_indices = + core::Tensor::Arange(0, source_points.GetShape()[0], 1, + core::Int64, device) + .IndexGet({valid}); + core::Tensor target_indices = + correspondence_indices.IndexGet({valid}).Reshape({-1}); + + inlier_count = source_indices.GetLength(); + + core::Tensor source_select = source_points.IndexGet({source_indices}); + core::Tensor target_select = target_points.IndexGet({target_indices}); + + core::Tensor mean_s = source_select.Mean({0}, true); + core::Tensor mean_t = target_select.Mean({0}, true); + + core::Device host("CPU:0"); + core::Tensor Sxy = (target_select - mean_t) + .T() + .Matmul(source_select - mean_s) + .Div_(static_cast(inlier_count)) + .To(host, core::Float64); + + mean_s = mean_s.To(host, core::Float64); + mean_t = mean_t.To(host, core::Float64); + + core::Tensor U, D, VT; + std::tie(U, D, VT) = Sxy.SVD(); + core::Tensor S = core::Tensor::Eye(3, core::Float64, host); + if (U.Det() * (VT.T()).Det() < 0) { + S[-1][-1] = -1; + } + R = U.Matmul(S.Matmul(VT)); + t = mean_t.Reshape({-1}) - R.Matmul(mean_s.T()).Reshape({-1}); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -294,6 +376,15 @@ core::Tensor ComputeInformationMatrix( CUDA_CALL(ComputeInformationMatrixCUDA, target_points.Contiguous(), correspondence_indices.Contiguous(), information_matrix, target_points.GetDtype(), device); + } else if (target_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeInformationMatrixSYCL(target_points.Contiguous(), + correspondence_indices.Contiguous(), + information_matrix, + target_points.GetDtype(), device); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git 
a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index 3f08ca4c811..f3b65414f88 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -149,6 +149,63 @@ void ComputeInformationMatrixCUDA(const core::Tensor &target_points, const core::Device &device); #endif +#ifdef BUILD_SYCL_MODULE +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel); + +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric); + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel 
&kernel_doppler, + const double lambda_doppler); + +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device); +#endif + template OPEN3D_HOST_DEVICE inline bool GetJacobianPointToPlane( int64_t workload_idx, diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp new file mode 100644 index 00000000000..17bf9768f93 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -0,0 +1,462 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/RegistrationImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" +#include "open3d/t/pipelines/registration/RobustKernel.h" +#include "open3d/t/pipelines/registration/RobustKernelImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +static constexpr int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) + +void ComputePosePointToPlaneSYCL( + const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + sycl::queue queue = + 
core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_ij[6] = {0}; + scalar_t r = 0; + const bool valid = + GetJacobianPointToPlane( + workload_idx, + source_points_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); + + if (valid) { + const scalar_t w = + GetWeightFromRobustKernel(r); + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[i]) += + J_ij[j] * w * J_ij[k]; + ++i; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[21 + j]) += + J_ij[j] * w * r; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[27]) += r; + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[28]) += + scalar_t(1); + } + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseColoredICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor 
&target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t sqrt_lambda_geometric = + static_cast(sqrt(lambda_geometric)); + const scalar_t sqrt_lambda_photometric = + static_cast(sqrt(1.0 - lambda_geometric)); + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *source_colors_ptr = + source_colors.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const scalar_t *target_colors_ptr = + target_colors.GetDataPtr(); + const scalar_t *target_color_gradients_ptr = + target_color_gradients.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_G[6] = {0}, J_I[6] = {0}; + scalar_t r_G = 0, r_I = 0; + + const bool valid = + GetJacobianColoredICP( + workload_idx, + source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, r_I); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernel(r_G); + const scalar_t w_I = + 
GetWeightFromRobustKernel(r_I); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[i]) += + J_G[j] * w_G * + J_G[k] + + J_I[j] * w_I * J_I[k]; + ++i; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[21 + j]) += + J_G[j] * w_G * r_G + + J_I[j] * w_I * r_I; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[27]) += + r_G * r_G + r_I * r_I; + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[28]) += + scalar_t(1); + } + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel &kernel_doppler, + const double lambda_doppler) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t 
sqrt_lambda_geometric = + sqrt(1.0 - static_cast(lambda_doppler)); + const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler); + const scalar_t sqrt_lambda_doppler_by_dt = + sqrt_lambda_doppler / static_cast(period); + + // Pre-compute v_s_in_S using a single-task kernel. + { + const scalar_t *R_S_to_V_ptr = R_S_to_V.GetDataPtr(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr(); + const scalar_t *w_v_in_V_ptr = w_v_in_V.GetDataPtr(); + const scalar_t *v_v_in_V_ptr = v_v_in_V.GetDataPtr(); + scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); + queue.single_task([=]() { + PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, + w_v_in_V_ptr, v_v_in_V_ptr, + v_s_in_S_ptr); + }).wait_and_throw(); + } + + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION( + scalar_t, kernel_geometric.type_, + kernel_geometric.scaling_parameter_, kernel_doppler.type_, + kernel_doppler.scaling_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *source_dopplers_ptr = + source_dopplers.GetDataPtr(); + const scalar_t *source_directions_ptr = + source_directions.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr(); + const scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_G[6] = {0}, J_D[6] = {0}; + scalar_t r_G = 0, r_D = 0; + + const bool valid = + GetJacobianDopplerICP( + workload_idx, + source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + 
r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, J_D, r_G, r_D); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernelFirst( + r_G); + const scalar_t w_D = + GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[i]) += + J_G[j] * w_G * + J_G[k] + + J_D[j] * w_D * J_D[k]; + ++i; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order:: + acq_rel, + sycl::memory_scope:: + device>( + global_sum_ptr[21 + j]) += + J_G[j] * w_G * r_G + + J_D[j] * w_D * r_D; + } + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[27]) += + r_G * r_G + r_D * r_D; + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[28]) += + scalar_t(1); + } + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); +} + +void ComputeInformationMatrixSYCL( + const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device) { + const int n = correspondence_indices.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + const int workload_idx = id[0]; + 
scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + const bool valid = GetInformationJacobians( + workload_idx, target_points_ptr, + correspondence_indices_ptr, J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref< + scalar_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + global_sum_ptr[i]) += + J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + ++i; + } + } + } + }) + .wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + core::Tensor global_sum_cpu = global_sum.To(host, core::Float64); + double *sum_ptr = global_sum_cpu.GetDataPtr(); + + // Information matrix is on CPU of type Float64. + double *GTG_ptr = information_matrix.GetDataPtr(); + + int i = 0; + for (int j = 0; j < 6; j++) { + for (int k = 0; k <= j; k++) { + GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i]; + ++i; + } + } + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp index c181ade770e..5526e2f1f59 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp @@ -63,6 +63,13 @@ static void PoseToTransformationDevice( PoseToTransformationCUDA(transformation_ptr, pose_ptr); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + PoseToTransformationSYCL(transformation_ptr, pose_ptr, + transformation.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -109,6 +116,13 @@ static void TransformationToPoseDevice( TransformationToPoseCUDA(pose_ptr, transformation_ptr); #else utility::LogError("Not compiled with 
CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + TransformationToPoseSYCL(pose_ptr, transformation_ptr, + pose.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h index e75484ce3f5..02cc741d62e 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h @@ -77,6 +77,26 @@ void TransformationToPoseCUDA(scalar_t *pose_ptr, const scalar_t *transformation_ptr); #endif +#ifdef BUILD_SYCL_MODULE +/// \brief Helper function for PoseToTransformationSYCL. +/// Do not call this independently, as it only sets the transformation part +/// in transformation matrix, using the Pose, the rest is set in +/// the parent function PoseToTransformation. +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device); + +/// \brief Helper function for TransformationToPoseSYCL. +/// Do not call this independently, as it only sets the rotation part in the +/// pose, using the Transformation, the rest is set in the parent function +/// TransformationToPose. 
+template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp new file mode 100644 index 00000000000..1ec3e943a3c --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp @@ -0,0 +1,56 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Device.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/t/pipelines/kernel/TransformationConverterImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + PoseToTransformationImpl(transformation_ptr, pose_ptr); + }).wait_and_throw(); +} + +template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + TransformationToPoseImpl(pose_ptr, transformation_ptr); + }).wait_and_throw(); +} + +template void PoseToTransformationSYCL(float *, + const float *, + const core::Device &); +template void PoseToTransformationSYCL(double *, + const double *, + const core::Device &); +template void TransformationToPoseSYCL(float *, + 
const float *, + const core::Device &); +template void TransformationToPoseSYCL(double *, + const double *, + const core::Device &); + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d From f547842fb1c688772ff18e59a7b1d0c2db645037 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:04:52 +0000 Subject: [PATCH 03/14] Fix RGBDOdometryImpl.h: source_normal_map -> target_normal_map in SYCL declaration Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index ff854a406e2..22955e17ab6 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -129,7 +129,7 @@ void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_depth, void ComputeOdometryResultPointToPlaneSYCL( const core::Tensor& source_vertex_map, const core::Tensor& target_vertex_map, - const core::Tensor& source_normal_map, + const core::Tensor& target_normal_map, const core::Tensor& intrinsics, const core::Tensor& init_source_to_target, core::Tensor& delta, From 83aa8d94c616dae820facb36f8e2923da0c3fe18 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Feb 2026 21:49:49 +0000 Subject: [PATCH 04/14] Fix ubuntu-sycl CI: use open3d_sycl_target_sources to add -fsycl flag to SYCL kernel files Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/pipelines/kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index 2a9478b4be9..785d58c85e4 100644 --- 
a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -23,7 +23,7 @@ if (BUILD_CUDA_MODULE) endif() if (BUILD_SYCL_MODULE) - target_sources(tpipelines_kernel PRIVATE + open3d_sycl_target_sources(tpipelines_kernel PRIVATE RegistrationSYCL.cpp FillInLinearSystemSYCL.cpp RGBDOdometrySYCL.cpp From 7b846bb7511d2c307574d24838fd97e08e48883e Mon Sep 17 00:00:00 2001 From: Sameer Sheorey Date: Fri, 6 Mar 2026 14:19:24 -0800 Subject: [PATCH 05/14] style fix --- cpp/open3d/t/pipelines/kernel/Feature.cpp | 3 +- cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp | 173 +++--- .../t/pipelines/kernel/FillInLinearSystem.cpp | 14 +- .../t/pipelines/kernel/FillInLinearSystem.h | 54 +- .../kernel/FillInLinearSystemSYCL.cpp | 579 ++++++++---------- .../t/pipelines/kernel/RGBDOdometry.cpp | 7 +- .../t/pipelines/kernel/RGBDOdometryImpl.h | 42 +- .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 343 +++++------ .../t/pipelines/kernel/Registration.cpp | 7 +- .../t/pipelines/kernel/RegistrationImpl.h | 52 +- .../t/pipelines/kernel/RegistrationSYCL.cpp | 159 +++-- .../kernel/TransformationConverterImpl.h | 8 +- .../kernel/TransformationConverterSYCL.cpp | 8 +- 13 files changed, 663 insertions(+), 786 deletions(-) diff --git a/cpp/open3d/t/pipelines/kernel/Feature.cpp b/cpp/open3d/t/pipelines/kernel/Feature.cpp index 4677396f1d7..5a779ecc48b 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.cpp +++ b/cpp/open3d/t/pipelines/kernel/Feature.cpp @@ -51,7 +51,8 @@ void ComputeFPFHFeature( } else if (points_d.IsSYCL()) { #ifdef BUILD_SYCL_MODULE ComputeFPFHFeatureSYCL(points_d, normals_d, indices, distance2, - counts_d, fpfhs, mask, map_info_idx_to_point_idx); + counts_d, fpfhs, mask, + map_info_idx_to_point_idx); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp index 7338edc63f8..8c41acf7aab 100644 --- 
a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp @@ -116,109 +116,86 @@ void ComputeFPFHFeatureSYCL( map_point_idx_to_spfh_idx.GetDataPtr(); // Compute SPFH features for the points. - queue.parallel_for( - sycl::range<1>{(size_t)n_spfh}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - int64_t workload_point_idx = - filter_fpfh - ? map_spfh_info_idx_to_point_idx_ptr - [workload_idx] - : workload_idx; - int64_t idx = 3 * workload_point_idx; - const scalar_t *point = points_ptr + idx; - const scalar_t *normal = normals_ptr + idx; - - const int indice_size = + queue.parallel_for(sycl::range<1>{(size_t)n_spfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_point_idx = + filter_fpfh ? map_spfh_info_idx_to_point_idx_ptr + [workload_idx] + : workload_idx; + int64_t idx = 3 * workload_point_idx; + const scalar_t *point = points_ptr + idx; + const scalar_t *normal = normals_ptr + idx; + + const int indice_size = + is_radius_search ? (counts_ptr[workload_idx + 1] - + counts_ptr[workload_idx]) + : counts_ptr[workload_idx]; + + if (indice_size > 1) { + const scalar_t hist_incr = + 100.0 / static_cast(indice_size - 1); + for (int i = 1; i < indice_size; i++) { + const int point_idx = is_radius_search - ? (counts_ptr[workload_idx + 1] - - counts_ptr[workload_idx]) - : counts_ptr[workload_idx]; - - if (indice_size > 1) { - const scalar_t hist_incr = - 100.0 / static_cast( - indice_size - 1); - for (int i = 1; i < indice_size; i++) { - const int point_idx = - is_radius_search - ? indices_ptr - [i + - counts_ptr - [workload_idx]] - : indices_ptr[workload_idx * - nn_size + - i]; - - const scalar_t *point_ref = - points_ptr + 3 * point_idx; - const scalar_t *normal_ref = - normals_ptr + 3 * point_idx; - scalar_t fea[4] = {0}; - ComputePairFeature( - point, normal, point_ref, normal_ref, - fea); - UpdateSPFHFeature( - fea, workload_idx, hist_incr, - spfhs_ptr); - } - } - }) - .wait_and_throw(); + ? 
indices_ptr[i + + counts_ptr[workload_idx]] + : indices_ptr[workload_idx * nn_size + + i]; + + const scalar_t *point_ref = points_ptr + 3 * point_idx; + const scalar_t *normal_ref = + normals_ptr + 3 * point_idx; + scalar_t fea[4] = {0}; + ComputePairFeature(point, normal, point_ref, + normal_ref, fea); + UpdateSPFHFeature(fea, workload_idx, + hist_incr, spfhs_ptr); + } + } + }).wait_and_throw(); // Compute FPFH features for the points. - queue.parallel_for( - sycl::range<1>{(size_t)n_fpfh}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - int64_t workload_spfh_idx = - filter_fpfh - ? map_point_idx_to_spfh_idx_ptr - [map_fpfh_idx_to_point_idx_ptr - [workload_idx]] - : workload_idx; - const int indice_size = + queue.parallel_for(sycl::range<1>{(size_t)n_fpfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_spfh_idx = + filter_fpfh ? map_point_idx_to_spfh_idx_ptr + [map_fpfh_idx_to_point_idx_ptr + [workload_idx]] + : workload_idx; + const int indice_size = + is_radius_search ? (counts_ptr[workload_spfh_idx + 1] - + counts_ptr[workload_spfh_idx]) + : counts_ptr[workload_spfh_idx]; + if (indice_size > 1) { + scalar_t sum[3] = {0.0, 0.0, 0.0}; + for (int i = 1; i < indice_size; i++) { + const int idx = is_radius_search - ? (counts_ptr[workload_spfh_idx + 1] - - counts_ptr[workload_spfh_idx]) - : counts_ptr[workload_spfh_idx]; - if (indice_size > 1) { - scalar_t sum[3] = {0.0, 0.0, 0.0}; - for (int i = 1; i < indice_size; i++) { - const int idx = - is_radius_search - ? i + - counts_ptr - [workload_spfh_idx] - : workload_spfh_idx * nn_size + - i; - const scalar_t dist = distance2_ptr[idx]; - if (dist == 0.0) continue; - const int32_t spfh_idx = - filter_fpfh - ? 
map_point_idx_to_spfh_idx_ptr - [indices_ptr[idx]] - : indices_ptr[idx]; - for (int j = 0; j < 33; j++) { - const scalar_t val = - spfhs_ptr[spfh_idx * 33 + j] / - dist; - sum[j / 11] += val; - fpfhs_ptr[workload_idx * 33 + j] += val; - } - } - for (int j = 0; j < 3; j++) { - sum[j] = sum[j] != 0.0 ? 100.0 / sum[j] : 0.0; - } - for (int j = 0; j < 33; j++) { - fpfhs_ptr[workload_idx * 33 + j] *= - sum[j / 11]; - fpfhs_ptr[workload_idx * 33 + j] += - spfhs_ptr[workload_spfh_idx * 33 + j]; - } + ? i + counts_ptr[workload_spfh_idx] + : workload_spfh_idx * nn_size + i; + const scalar_t dist = distance2_ptr[idx]; + if (dist == 0.0) continue; + const int32_t spfh_idx = + filter_fpfh ? map_point_idx_to_spfh_idx_ptr + [indices_ptr[idx]] + : indices_ptr[idx]; + for (int j = 0; j < 33; j++) { + const scalar_t val = + spfhs_ptr[spfh_idx * 33 + j] / dist; + sum[j / 11] += val; + fpfhs_ptr[workload_idx * 33 + j] += val; } - }) - .wait_and_throw(); + } + for (int j = 0; j < 3; j++) { + sum[j] = sum[j] != 0.0 ? 
100.0 / sum[j] : 0.0; + } + for (int j = 0; j < 33; j++) { + fpfhs_ptr[workload_idx * 33 + j] *= sum[j / 11]; + fpfhs_ptr[workload_idx * 33 + j] += + spfhs_ptr[workload_spfh_idx * 33 + j]; + } + } + }).wait_and_throw(); }); } diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp index 82a2c33ac93..f9e358d6636 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp @@ -134,10 +134,9 @@ void FillInSLACAlignmentTerm(core::Tensor &AtA, #endif } else if (AtA.IsSYCL()) { #ifdef BUILD_SYCL_MODULE - FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, - normal_ps, Ri_normal_ps, - RjT_Ri_normal_ps, cgrid_idx_ps, - cgrid_idx_qs, cgrid_ratio_ps, + FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, normal_ps, + Ri_normal_ps, RjT_Ri_normal_ps, + cgrid_idx_ps, cgrid_idx_qs, cgrid_ratio_ps, cgrid_ratio_qs, i, j, n, threshold); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); @@ -183,10 +182,9 @@ void FillInSLACRegularizerTerm(core::Tensor &AtA, #endif } else if (AtA.IsSYCL()) { #ifdef BUILD_SYCL_MODULE - FillInSLACRegularizerTermSYCL(AtA, Atb, residual, grid_idx, - grid_nbs_idx, grid_nbs_mask, - positions_init, positions_curr, weight, n, - anchor_idx); + FillInSLACRegularizerTermSYCL( + AtA, Atb, residual, grid_idx, grid_nbs_idx, grid_nbs_mask, + positions_init, positions_curr, weight, n, anchor_idx); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h index 91014b84dd2..69da827ab61 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h @@ -133,43 +133,43 @@ void FillInSLACRegularizerTermCUDA(core::Tensor &AtA, #endif #ifdef BUILD_SYCL_MODULE void FillInRigidAlignmentTermSYCL(core::Tensor 
&AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &Ti_qs, - const core::Tensor &Tj_qs, - const core::Tensor &Ri_normal_ps, - int i, - int j, - float threshold); - -void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &Ti_qs, const core::Tensor &Tj_qs, - const core::Tensor &normal_ps, const core::Tensor &Ri_normal_ps, - const core::Tensor &RjT_Ri_normal_ps, - const core::Tensor &cgrid_idx_ps, - const core::Tensor &cgrid_idx_qs, - const core::Tensor &cgrid_ratio_qs, - const core::Tensor &cgrid_ratio_ps, int i, int j, - int n, float threshold); +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &normal_ps, + const core::Tensor &Ri_normal_ps, + const core::Tensor &RjT_Ri_normal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n, + float threshold); + void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &grid_idx, - const core::Tensor &grid_nbs_idx, - const core::Tensor &grid_nbs_mask, - const core::Tensor &positions_init, - const core::Tensor &positions_curr, - float weight, - int n, - int anchor_idx); + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n, + int anchor_idx); #endif diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp index e99b7f5ccb6..2f16deec4e8 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ 
-21,14 +21,14 @@ namespace pipelines { namespace kernel { void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &Ti_ps, - const core::Tensor &Tj_qs, - const core::Tensor &Ri_normal_ps, - int i, - int j, - float threshold) { + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_ps, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold) { core::Device device = AtA.GetDevice(); int64_t n = Ti_ps.GetLength(); if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) { @@ -52,54 +52,46 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - const float *p_prime = Ti_ps_ptr + 3 * workload_idx; - const float *q_prime = Tj_qs_ptr + 3 * workload_idx; - const float *normal_p_prime = - Ri_normal_ps_ptr + 3 * workload_idx; - - float r = (p_prime[0] - q_prime[0]) * - normal_p_prime[0] + - (p_prime[1] - q_prime[1]) * - normal_p_prime[1] + - (p_prime[2] - q_prime[2]) * normal_p_prime[2]; - if (sycl::fabs(r) > threshold) return; - - float J_ij[12]; - J_ij[0] = -q_prime[2] * normal_p_prime[1] + - q_prime[1] * normal_p_prime[2]; - J_ij[1] = q_prime[2] * normal_p_prime[0] - - q_prime[0] * normal_p_prime[2]; - J_ij[2] = -q_prime[1] * normal_p_prime[0] + - q_prime[0] * normal_p_prime[1]; - J_ij[3] = normal_p_prime[0]; - J_ij[4] = normal_p_prime[1]; - J_ij[5] = normal_p_prime[2]; - for (int k = 0; k < 6; ++k) { - J_ij[k + 6] = -J_ij[k]; - } - - for (int i_local = 0; i_local < 12; ++i_local) { - for (int j_local = 0; j_local < 12; ++j_local) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_local_ptr[i_local * 12 + j_local]) += - J_ij[i_local] * J_ij[j_local]; - } - sycl::atomic_ref( - Atb_local_ptr[i_local]) += - 
J_ij[i_local] * r; - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *p_prime = Ti_ps_ptr + 3 * workload_idx; + const float *q_prime = Tj_qs_ptr + 3 * workload_idx; + const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx; + + float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * normal_p_prime[2]; + if (sycl::fabs(r) > threshold) return; + + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; ++i_local) { + for (int j_local = 0; j_local < 12; ++j_local) { sycl::atomic_ref( - *residual_ptr) += r * r; - }) - .wait_and_throw(); + AtA_local_ptr[i_local * 12 + j_local]) += + J_ij[i_local] * J_ij[j_local]; + } + sycl::atomic_ref( + Atb_local_ptr[i_local]) += J_ij[i_local] * r; + } + sycl::atomic_ref(*residual_ptr) += + r * r; + }).wait_and_throw(); // Then fill-in the large linear system. 
std::vector indices_vec(12); @@ -129,25 +121,24 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, } void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &Ti_Cps, - const core::Tensor &Tj_Cqs, - const core::Tensor &Cnormal_ps, - const core::Tensor &Ri_Cnormal_ps, - const core::Tensor &RjT_Ri_Cnormal_ps, - const core::Tensor &cgrid_idx_ps, - const core::Tensor &cgrid_idx_qs, - const core::Tensor &cgrid_ratio_qs, - const core::Tensor &cgrid_ratio_ps, - int i, - int j, - int n_frags, - float threshold) { + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_Cps, + const core::Tensor &Tj_Cqs, + const core::Tensor &Cnormal_ps, + const core::Tensor &Ri_Cnormal_ps, + const core::Tensor &RjT_Ri_Cnormal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n_frags, + float threshold) { int64_t n = Ti_Cps.GetLength(); if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n || - Ri_Cnormal_ps.GetLength() != n || - RjT_Ri_Cnormal_ps.GetLength() != n || + Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n || cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n || cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) { utility::LogError( @@ -160,10 +151,8 @@ void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, float *Atb_ptr = static_cast(Atb.GetDataPtr()); float *residual_ptr = static_cast(residual.GetDataPtr()); - const float *Ti_Cps_ptr = - static_cast(Ti_Cps.GetDataPtr()); - const float *Tj_Cqs_ptr = - static_cast(Tj_Cqs.GetDataPtr()); + const float *Ti_Cps_ptr = static_cast(Ti_Cps.GetDataPtr()); + const float *Tj_Cqs_ptr = static_cast(Tj_Cqs.GetDataPtr()); const float *Cnormal_ps_ptr = static_cast(Cnormal_ps.GetDataPtr()); const float *Ri_Cnormal_ps_ptr = @@ -182,113 +171,88 @@ void FillInSLACAlignmentTermSYCL(core::Tensor 
&AtA, sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx; - const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx; - const float *Cnormal_p = - Cnormal_ps_ptr + 3 * workload_idx; - const float *Ri_Cnormal_p = - Ri_Cnormal_ps_ptr + 3 * workload_idx; - const float *RjTRi_Cnormal_p = - RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx; - - const int *cgrid_idx_p = - cgrid_idx_ps_ptr + 8 * workload_idx; - const int *cgrid_idx_q = - cgrid_idx_qs_ptr + 8 * workload_idx; - const float *cgrid_ratio_p = - cgrid_ratio_ps_ptr + 8 * workload_idx; - const float *cgrid_ratio_q = - cgrid_ratio_qs_ptr + 8 * workload_idx; - - float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + - (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + - (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; - if (sycl::fabs(r) > threshold) return; - - float J[60]; - int idx[60]; - - J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + - Tj_Cq[1] * Ri_Cnormal_p[2]; - J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - - Tj_Cq[0] * Ri_Cnormal_p[2]; - J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + - Tj_Cq[0] * Ri_Cnormal_p[1]; - J[3] = Ri_Cnormal_p[0]; - J[4] = Ri_Cnormal_p[1]; - J[5] = Ri_Cnormal_p[2]; - - for (int k = 0; k < 6; ++k) { - J[k + 6] = -J[k]; - idx[k + 0] = 6 * i + k; - idx[k + 6] = 6 * j + k; - } - - for (int k = 0; k < 8; ++k) { - J[12 + k * 3 + 0] = - cgrid_ratio_p[k] * Cnormal_p[0]; - J[12 + k * 3 + 1] = - cgrid_ratio_p[k] * Cnormal_p[1]; - J[12 + k * 3 + 2] = - cgrid_ratio_p[k] * Cnormal_p[2]; - idx[12 + k * 3 + 0] = - 6 * n_frags + cgrid_idx_p[k] * 3 + 0; - idx[12 + k * 3 + 1] = - 6 * n_frags + cgrid_idx_p[k] * 3 + 1; - idx[12 + k * 3 + 2] = - 6 * n_frags + cgrid_idx_p[k] * 3 + 2; - } - - for (int k = 0; k < 8; ++k) { - J[36 + k * 3 + 0] = - -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; - J[36 + k * 3 + 1] = - -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; - J[36 + k * 3 + 2] = - 
-cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; - idx[36 + k * 3 + 0] = - 6 * n_frags + cgrid_idx_q[k] * 3 + 0; - idx[36 + k * 3 + 1] = - 6 * n_frags + cgrid_idx_q[k] * 3 + 1; - idx[36 + k * 3 + 2] = - 6 * n_frags + cgrid_idx_q[k] * 3 + 2; - } - - for (int ki = 0; ki < 60; ++ki) { - for (int kj = 0; kj < 60; ++kj) { - float AtA_ij = J[ki] * J[kj]; - int ij = idx[ki] * n_vars + idx[kj]; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[ij]) += AtA_ij; - } - sycl::atomic_ref( - Atb_ptr[idx[ki]]) += J[ki] * r; - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx; + const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx; + const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx; + const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx; + const float *RjTRi_Cnormal_p = + RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx; + + const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx; + const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx; + const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx; + const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx; + + float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + + (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + + (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; + if (sycl::fabs(r) > threshold) return; + + float J[60]; + int idx[60]; + + J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2]; + J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2]; + J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1]; + J[3] = Ri_Cnormal_p[0]; + J[4] = Ri_Cnormal_p[1]; + J[5] = Ri_Cnormal_p[2]; + + for (int k = 0; k < 6; ++k) { + J[k + 6] = -J[k]; + idx[k + 0] = 6 * i + k; + idx[k + 6] = 6 * j + k; + } + + for (int k = 0; k < 8; ++k) { + J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0]; + J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1]; + 
J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2]; + idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0; + idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1; + idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2; + } + + for (int k = 0; k < 8; ++k) { + J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; + J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; + J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; + idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0; + idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1; + idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2; + } + + for (int ki = 0; ki < 60; ++ki) { + for (int kj = 0; kj < 60; ++kj) { + float AtA_ij = J[ki] * J[kj]; + int ij = idx[ki] * n_vars + idx[kj]; sycl::atomic_ref( - *residual_ptr) += r * r; - }) - .wait_and_throw(); + AtA_ptr[ij]) += AtA_ij; + } + sycl::atomic_ref( + Atb_ptr[idx[ki]]) += J[ki] * r; + } + sycl::atomic_ref(*residual_ptr) += + r * r; + }).wait_and_throw(); } void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, - core::Tensor &Atb, - core::Tensor &residual, - const core::Tensor &grid_idx, - const core::Tensor &grid_nbs_idx, - const core::Tensor &grid_nbs_mask, - const core::Tensor &positions_init, - const core::Tensor &positions_curr, - float weight, - int n_frags, - int anchor_idx) { + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n_frags, + int anchor_idx) { int64_t n = grid_idx.GetLength(); int64_t n_vars = Atb.GetLength(); @@ -297,8 +261,7 @@ void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, float *Atb_ptr = static_cast(Atb.GetDataPtr()); float *residual_ptr = static_cast(residual.GetDataPtr()); - const int *grid_idx_ptr = - static_cast(grid_idx.GetDataPtr()); + const int *grid_idx_ptr = 
static_cast(grid_idx.GetDataPtr()); const int *grid_nbs_idx_ptr = static_cast(grid_nbs_idx.GetDataPtr()); const bool *grid_nbs_mask_ptr = @@ -310,144 +273,130 @@ void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - int idx_i = grid_idx_ptr[workload_idx]; - - const int *idx_nbs = - grid_nbs_idx_ptr + 6 * workload_idx; - const bool *mask_nbs = - grid_nbs_mask_ptr + 6 * workload_idx; - - float cov[3][3] = {{0}}; - float U[3][3], V[3][3], S[3]; - - int cnt = 0; - for (int k = 0; k < 6; ++k) { - if (!mask_nbs[k]) continue; - int idx_k = idx_nbs[k]; - - float diff_ik_init[3] = { - positions_init_ptr[idx_i * 3 + 0] - - positions_init_ptr[idx_k * 3 + 0], - positions_init_ptr[idx_i * 3 + 1] - - positions_init_ptr[idx_k * 3 + 1], - positions_init_ptr[idx_i * 3 + 2] - - positions_init_ptr[idx_k * 3 + 2]}; - float diff_ik_curr[3] = { - positions_curr_ptr[idx_i * 3 + 0] - - positions_curr_ptr[idx_k * 3 + 0], - positions_curr_ptr[idx_i * 3 + 1] - - positions_curr_ptr[idx_k * 3 + 1], - positions_curr_ptr[idx_i * 3 + 2] - - positions_curr_ptr[idx_k * 3 + 2]}; - - for (int ii = 0; ii < 3; ++ii) { - for (int jj = 0; jj < 3; ++jj) { - cov[ii][jj] += diff_ik_init[ii] * - diff_ik_curr[jj]; - } - } - ++cnt; - } - - if (cnt < 3) return; - - core::linalg::kernel::svd3x3(*cov, *U, S, *V); - - float R[3][3]; - core::linalg::kernel::transpose3x3_(*U); - core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); - - float d = core::linalg::kernel::det3x3(*R); - if (d < 0) { - U[2][0] = -U[2][0]; - U[2][1] = -U[2][1]; - U[2][2] = -U[2][2]; - core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); - } - - if (idx_i == anchor_idx) { - R[0][0] = R[1][1] = R[2][2] = 1; - R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = - R[2][1] = 0; + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx 
= id[0]; + int idx_i = grid_idx_ptr[workload_idx]; + + const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx; + const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx; + + float cov[3][3] = {{0}}; + float U[3][3], V[3][3], S[3]; + + int cnt = 0; + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + + for (int ii = 0; ii < 3; ++ii) { + for (int jj = 0; jj < 3; ++jj) { + cov[ii][jj] += diff_ik_init[ii] * diff_ik_curr[jj]; } + } + ++cnt; + } + + if (cnt < 3) return; + + core::linalg::kernel::svd3x3(*cov, *U, S, *V); + + float R[3][3]; + core::linalg::kernel::transpose3x3_(*U); + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + + float d = core::linalg::kernel::det3x3(*R); + if (d < 0) { + U[2][0] = -U[2][0]; + U[2][1] = -U[2][1]; + U[2][2] = -U[2][2]; + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + } + + if (idx_i == anchor_idx) { + R[0][0] = R[1][1] = R[2][2] = 1; + R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0; + } + + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + 
positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + float R_diff_ik_curr[3]; + core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, + R_diff_ik_curr); + + float local_r[3]; + local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; + local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; + local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; + + int offset_idx_i = 3 * idx_i + 6 * n_frags; + int offset_idx_k = 3 * idx_k + 6 * n_frags; + + sycl::atomic_ref(*residual_ptr) += + weight * + (local_r[0] * local_r[0] + local_r[1] * local_r[1] + + local_r[2] * local_r[2]); + + for (int axis = 0; axis < 3; ++axis) { + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_i + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_k + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_k + axis]) -= weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_i + axis]) -= weight; - for (int k = 0; k < 6; ++k) { - if (!mask_nbs[k]) continue; - int idx_k = idx_nbs[k]; - - float diff_ik_init[3] = { - positions_init_ptr[idx_i * 3 + 0] - - positions_init_ptr[idx_k * 3 + 0], - positions_init_ptr[idx_i * 3 + 1] - - positions_init_ptr[idx_k * 3 + 1], - positions_init_ptr[idx_i * 3 + 2] - - positions_init_ptr[idx_k * 3 + 2]}; - float diff_ik_curr[3] = { - positions_curr_ptr[idx_i * 3 + 0] - - positions_curr_ptr[idx_k * 3 + 0], - positions_curr_ptr[idx_i * 3 + 1] - - positions_curr_ptr[idx_k * 3 + 1], - positions_curr_ptr[idx_i * 3 + 2] - - positions_curr_ptr[idx_k * 3 + 2]}; - float R_diff_ik_curr[3]; - core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, - R_diff_ik_curr); - - float local_r[3]; - local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; - local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; - local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; - - int offset_idx_i = 3 * idx_i + 6 * n_frags; - int offset_idx_k = 
3 * idx_k + 6 * n_frags; - - sycl::atomic_ref( - *residual_ptr) += - weight * (local_r[0] * local_r[0] + - local_r[1] * local_r[1] + - local_r[2] * local_r[2]); - - for (int axis = 0; axis < 3; ++axis) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_i + axis) * n_vars + - offset_idx_i + axis]) += weight; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_k + axis) * n_vars + - offset_idx_k + axis]) += weight; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_i + axis) * n_vars + - offset_idx_k + axis]) -= weight; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - AtA_ptr[(offset_idx_k + axis) * n_vars + - offset_idx_i + axis]) -= weight; - - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - Atb_ptr[offset_idx_i + axis]) += - weight * local_r[axis]; - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - Atb_ptr[offset_idx_k + axis]) -= - weight * local_r[axis]; - } - } - }) - .wait_and_throw(); + sycl::atomic_ref( + Atb_ptr[offset_idx_i + axis]) += + weight * local_r[axis]; + sycl::atomic_ref( + Atb_ptr[offset_idx_k + axis]) -= + weight * local_r[axis]; + } + } + }).wait_and_throw(); } } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp index 6b54fb41961..4a9afaff45f 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp @@ -254,10 +254,9 @@ void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map, square_dist_thr, information); } else if (device.GetType() == core::Device::DeviceType::SYCL) { #ifdef BUILD_SYCL_MODULE - ComputeOdometryInformationMatrixSYCL(source_vertex_map, - target_vertex_map, intrinsic, - 
source_to_target, square_dist_thr, - information); + ComputeOdometryInformationMatrixSYCL( + source_vertex_map, target_vertex_map, intrinsic, + source_to_target, square_dist_thr, information); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index 22955e17ab6..3c65eec81e8 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -155,29 +155,29 @@ void ComputeOdometryResultIntensitySYCL( const float intensity_huber_delta); void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_depth_dx, - const core::Tensor& target_depth_dy, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const float intensity_huber_delta); + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta); void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& 
intrinsic, - const core::Tensor& source_to_target, - const float square_dist_thr, - core::Tensor& information); + const core::Tensor& target_depth, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information); #endif } // namespace odometry diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp index f48468763f1..d49ed13b4b1 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -54,52 +54,42 @@ void ComputeOdometryResultPointToPlaneSYCL( sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J[6] = {0}; - float r = 0; - const bool valid = GetJacobianPointToPlane( - x, y, depth_outlier_trunc, source_vertex_indexer, - target_vertex_indexer, target_normal_indexer, ti, - J, r); - - if (valid) { - const float d_huber = - HuberDeriv(r, depth_huber_delta); - const float r_huber = - HuberLoss(r, depth_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J[i] * J[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[21 + i]) += - J[i] * d_huber; - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J[6] = {0}; + float r = 0; + const bool valid = GetJacobianPointToPlane( + x, y, depth_outlier_trunc, source_vertex_indexer, + target_vertex_indexer, 
target_normal_indexer, ti, J, r); + + if (valid) { + const float d_huber = HuberDeriv(r, depth_huber_delta); + const float r_huber = HuberLoss(r, depth_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; + global_sum_ptr[offset++]) += J[i] * J[j]; } - }) - .wait_and_throw(); + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref( + global_sum_ptr[21 + i]) += J[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } @@ -141,77 +131,66 @@ void ComputeOdometryResultIntensitySYCL( sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}; - float r_I = 0; - const bool valid = GetJacobianIntensity( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, - target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, - ti, J_I, r_I); - - if (valid) { - const float d_huber = - HuberDeriv(r_I, intensity_huber_delta); - const float r_huber = - HuberLoss(r_I, intensity_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J_I[i] * J_I[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[21 + i]) += - J_I[i] * d_huber; - } - sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; + 
queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, target_intensity_dx_indexer, + target_intensity_dy_indexer, source_vertex_indexer, ti, + J_I, r_I); + + if (valid) { + const float d_huber = HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = HuberLoss(r_I, intensity_huber_delta); + + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; + global_sum_ptr[offset++]) += J_I[i] * J_I[j]; } - }) - .wait_and_throw(); + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref( + global_sum_ptr[21 + i]) += J_I[i] * d_huber; + } + sycl::atomic_ref( + global_sum_ptr[27]) += r_huber; + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryResultHybridSYCL( - const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_depth_dx, - const core::Tensor& target_depth_dy, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const float intensity_huber_delta) { +void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + 
const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); NDArrayIndexer target_depth_indexer(target_depth, 2); NDArrayIndexer source_intensity_indexer(source_intensity, 2); @@ -236,67 +215,58 @@ void ComputeOdometryResultHybridSYCL( sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}, J_D[6] = {0}; - float r_I = 0, r_D = 0; - const bool valid = GetJacobianHybrid( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, target_depth_dx_indexer, - target_depth_dy_indexer, - target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, - ti, J_I, J_D, r_I, r_D); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J_I[i] * J_I[j] + J_D[i] * J_D[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[21 + i]) += - J_I[i] * - HuberDeriv(r_I, - intensity_huber_delta) + - J_D[i] * HuberDeriv(r_D, depth_huber_delta); - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = 
workload_idx / cols; + const int x = workload_idx % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, source_depth_indexer, + target_depth_indexer, source_intensity_indexer, + target_intensity_indexer, target_depth_dx_indexer, + target_depth_dy_indexer, target_intensity_dx_indexer, + target_intensity_dy_indexer, source_vertex_indexer, ti, + J_I, J_D, r_I, r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { sycl::atomic_ref( - global_sum_ptr[27]) += - HuberLoss(r_I, intensity_huber_delta) + - HuberLoss(r_D, depth_huber_delta); - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; + global_sum_ptr[offset++]) += + J_I[i] * J_I[j] + J_D[i] * J_D[j]; } - }) - .wait_and_throw(); + } + for (int i = 0; i < 6; ++i) { + sycl::atomic_ref( + global_sum_ptr[21 + i]) += + J_I[i] * HuberDeriv(r_I, intensity_huber_delta) + + J_D[i] * HuberDeriv(r_D, depth_huber_delta); + } + sycl::atomic_ref( + global_sum_ptr[27]) += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + sycl::atomic_ref( + global_sum_ptr[28]) += 1.0f; + } + }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryInformationMatrixSYCL( - const core::Tensor& source_vertex_map, - const core::Tensor& target_vertex_map, - const core::Tensor& intrinsic, - const core::Tensor& source_to_target, - const float square_dist_thr, - core::Tensor& information) { +void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); @@ -314,35 +284,30 @@ void ComputeOdometryInformationMatrixSYCL( sycl::queue 
queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - float rx = 0, ry = 0, rz = 0; - const bool valid = GetJacobianPointToPoint( - x, y, square_dist_thr, source_vertex_indexer, - target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, - rz); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[offset++]) += - J_x[i] * J_x[j] + J_y[i] * J_y[j] + - J_z[i] * J_z[j]; - } - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + const int y = workload_idx / cols; + const int x = workload_idx % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + sycl::atomic_ref( + global_sum_ptr[offset++]) += J_x[i] * J_x[j] + + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; } - }) - .wait_and_throw(); + } + } + }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); information = core::Tensor::Empty({6, 6}, core::Float64, host); diff --git a/cpp/open3d/t/pipelines/kernel/Registration.cpp b/cpp/open3d/t/pipelines/kernel/Registration.cpp index b3ebf4f5508..95ccddec5c2 100644 --- a/cpp/open3d/t/pipelines/kernel/Registration.cpp +++ b/cpp/open3d/t/pipelines/kernel/Registration.cpp @@ -378,10 +378,9 @@ core::Tensor ComputeInformationMatrix( target_points.GetDtype(), device); } else if (target_points.IsSYCL()) { #ifdef BUILD_SYCL_MODULE - 
ComputeInformationMatrixSYCL(target_points.Contiguous(), - correspondence_indices.Contiguous(), - information_matrix, - target_points.GetDtype(), device); + ComputeInformationMatrixSYCL( + target_points.Contiguous(), correspondence_indices.Contiguous(), + information_matrix, target_points.GetDtype(), device); #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index f3b65414f88..94acdf773b3 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -151,30 +151,30 @@ void ComputeInformationMatrixCUDA(const core::Tensor &target_points, #ifdef BUILD_SYCL_MODULE void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel); + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel); void ComputePoseColoredICPSYCL(const core::Tensor &source_points, - const core::Tensor &source_colors, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &target_colors, - const core::Tensor &target_color_gradients, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel, - const double &lambda_geometric); + const core::Tensor &source_colors, + const core::Tensor &target_points, + const 
core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric); void ComputePoseDopplerICPSYCL( const core::Tensor &source_points, @@ -200,10 +200,10 @@ void ComputePoseDopplerICPSYCL( const double lambda_doppler); void ComputeInformationMatrixSYCL(const core::Tensor &target_points, - const core::Tensor &correspondence_indices, - core::Tensor &information_matrix, - const core::Dtype &dtype, - const core::Device &device); + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device); #endif template diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp index 17bf9768f93..b44c696e7e4 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -18,19 +18,19 @@ namespace t { namespace pipelines { namespace kernel { -static constexpr int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) - -void ComputePosePointToPlaneSYCL( - const core::Tensor &source_points, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel) { +static constexpr int kReduceDim = + 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) + +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int 
&inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel) { const int n = source_points.GetLength(); core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); @@ -113,21 +113,20 @@ void ComputePosePointToPlaneSYCL( DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); } -void ComputePoseColoredICPSYCL( - const core::Tensor &source_points, - const core::Tensor &source_colors, - const core::Tensor &target_points, - const core::Tensor &target_normals, - const core::Tensor &target_colors, - const core::Tensor &target_color_gradients, - const core::Tensor &correspondence_indices, - core::Tensor &pose, - float &residual, - int &inlier_count, - const core::Dtype &dtype, - const core::Device &device, - const registration::RobustKernel &kernel, - const double &lambda_geometric) { +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric) { const int n = source_points.GetLength(); core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); @@ -197,8 +196,7 @@ void ComputePoseColoredICPSYCL( sycl::memory_scope:: device>( global_sum_ptr[i]) += - J_G[j] * w_G * - J_G[k] + + J_G[j] * w_G * J_G[k] + J_I[j] * w_I * J_I[k]; ++i; } @@ -280,8 +278,8 @@ void ComputePoseDopplerICPSYCL( scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); queue.single_task([=]() { PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, - w_v_in_V_ptr, v_v_in_V_ptr, - v_s_in_S_ptr); + w_v_in_V_ptr, v_v_in_V_ptr, + v_s_in_S_ptr); }).wait_and_throw(); } @@ -317,25 +315,23 
@@ void ComputePoseDopplerICPSYCL( scalar_t J_G[6] = {0}, J_D[6] = {0}; scalar_t r_G = 0, r_D = 0; - const bool valid = - GetJacobianDopplerICP( - workload_idx, - source_points_ptr, - source_dopplers_ptr, - source_directions_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - R_S_to_V_ptr, - r_v_to_s_in_V_ptr, - v_s_in_S_ptr, - reject_dynamic_outliers, - static_cast( - doppler_outlier_threshold), - sqrt_lambda_geometric, - sqrt_lambda_doppler, - sqrt_lambda_doppler_by_dt, - J_G, J_D, r_G, r_D); + const bool valid = GetJacobianDopplerICP< + scalar_t>( + workload_idx, source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, J_G, + J_D, r_G, r_D); if (valid) { const scalar_t w_G = @@ -355,8 +351,7 @@ void ComputePoseDopplerICPSYCL( sycl::memory_scope:: device>( global_sum_ptr[i]) += - J_G[j] * w_G * - J_G[k] + + J_G[j] * w_G * J_G[k] + J_D[j] * w_D * J_D[k]; ++i; } @@ -391,12 +386,11 @@ void ComputePoseDopplerICPSYCL( DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); } -void ComputeInformationMatrixSYCL( - const core::Tensor &target_points, - const core::Tensor &correspondence_indices, - core::Tensor &information_matrix, - const core::Dtype &dtype, - const core::Device &device) { +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device) { const int n = correspondence_indices.GetLength(); core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); @@ -411,33 +405,28 @@ void ComputeInformationMatrixSYCL( const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - 
queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - const bool valid = GetInformationJacobians( - workload_idx, target_points_ptr, - correspondence_indices_ptr, J_x, J_y, J_z); - - if (valid) { - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[i]) += - J_x[j] * J_x[k] + - J_y[j] * J_y[k] + - J_z[j] * J_z[k]; - ++i; - } - } + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + const int workload_idx = id[0]; + scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + const bool valid = GetInformationJacobians( + workload_idx, target_points_ptr, + correspondence_indices_ptr, J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + sycl::atomic_ref( + global_sum_ptr[i]) += J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + ++i; } - }) - .wait_and_throw(); + } + } + }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); core::Tensor global_sum_cpu = global_sum.To(host, core::Float64); diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h index 02cc741d62e..6ee52ee8ab5 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h @@ -84,8 +84,8 @@ void TransformationToPoseCUDA(scalar_t *pose_ptr, /// the parent function PoseToTransformation. template void PoseToTransformationSYCL(scalar_t *transformation_ptr, - const scalar_t *pose_ptr, - const core::Device &device); + const scalar_t *pose_ptr, + const core::Device &device); /// \brief Helper function for TransformationToPoseSYCL. 
/// Do not call this independently, as it only sets the rotation part in the @@ -93,8 +93,8 @@ void PoseToTransformationSYCL(scalar_t *transformation_ptr, /// TransformationToPose. template void TransformationToPoseSYCL(scalar_t *pose_ptr, - const scalar_t *transformation_ptr, - const core::Device &device); + const scalar_t *transformation_ptr, + const core::Device &device); #endif } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp index 1ec3e943a3c..13eb4490465 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp @@ -17,8 +17,8 @@ namespace kernel { template void PoseToTransformationSYCL(scalar_t *transformation_ptr, - const scalar_t *pose_ptr, - const core::Device &device) { + const scalar_t *pose_ptr, + const core::Device &device) { sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); queue.single_task([=]() { @@ -28,8 +28,8 @@ void PoseToTransformationSYCL(scalar_t *transformation_ptr, template void TransformationToPoseSYCL(scalar_t *pose_ptr, - const scalar_t *transformation_ptr, - const core::Device &device) { + const scalar_t *transformation_ptr, + const core::Device &device) { sycl::queue queue = core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); queue.single_task([=]() { From 5c3b28c3cf3f2b76da8705105d0bffad7a01902c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:41:01 +0000 Subject: [PATCH 06/14] Update t/pipelines tests to use PermuteDevicesWithSYCL for SYCL device coverage Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/tests/t/pipelines/TransformationConverter.cpp | 4 ++-- cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp | 4 ++-- cpp/tests/t/pipelines/registration/Feature.cpp | 4 ++-- 
cpp/tests/t/pipelines/registration/Registration.cpp | 4 ++-- .../t/pipelines/registration/TransformationEstimation.cpp | 4 ++-- cpp/tests/t/pipelines/slac/ControlGrid.cpp | 4 ++-- cpp/tests/t/pipelines/slac/SLAC.cpp | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/tests/t/pipelines/TransformationConverter.cpp b/cpp/tests/t/pipelines/TransformationConverter.cpp index 5d8bd0d8434..8036b55a08f 100644 --- a/cpp/tests/t/pipelines/TransformationConverter.cpp +++ b/cpp/tests/t/pipelines/TransformationConverter.cpp @@ -14,10 +14,10 @@ namespace open3d { namespace tests { -class TransformationConverterPermuteDevices : public PermuteDevices {}; +class TransformationConverterPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(TransformationConverter, TransformationConverterPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TransformationConverterPermuteDevices, RtToTransformation) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp index 3bb7ee36142..7d1b8b6466b 100644 --- a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp +++ b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp @@ -21,10 +21,10 @@ namespace open3d { namespace tests { -class OdometryPermuteDevices : public PermuteDevices {}; +class OdometryPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Odometry, OdometryPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); core::Tensor CreateIntrisicTensor() { camera::PinholeCameraIntrinsic intrinsic = camera::PinholeCameraIntrinsic( diff --git a/cpp/tests/t/pipelines/registration/Feature.cpp b/cpp/tests/t/pipelines/registration/Feature.cpp index a3741a3ed8a..1266ab7f705 100644 --- a/cpp/tests/t/pipelines/registration/Feature.cpp +++ 
b/cpp/tests/t/pipelines/registration/Feature.cpp @@ -20,10 +20,10 @@ namespace open3d { namespace tests { -class FeaturePermuteDevices : public PermuteDevices {}; +class FeaturePermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Feature, FeaturePermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(FeaturePermuteDevices, SelectByIndex) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/registration/Registration.cpp b/cpp/tests/t/pipelines/registration/Registration.cpp index d823394c6bf..76b31da3dd9 100644 --- a/cpp/tests/t/pipelines/registration/Registration.cpp +++ b/cpp/tests/t/pipelines/registration/Registration.cpp @@ -26,10 +26,10 @@ namespace l_reg = open3d::pipelines::registration; namespace open3d { namespace tests { -class RegistrationPermuteDevices : public PermuteDevices {}; +class RegistrationPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Registration, RegistrationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(RegistrationPermuteDevices, ICPConvergenceCriteriaConstructor) { // Constructor. 
diff --git a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp index 2daca846d0a..c98ab268529 100644 --- a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp +++ b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp @@ -15,10 +15,10 @@ namespace open3d { namespace tests { -class TransformationEstimationPermuteDevices : public PermuteDevices {}; +class TransformationEstimationPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(TransformationEstimation, TransformationEstimationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); static std:: tuple diff --git a/cpp/tests/t/pipelines/slac/ControlGrid.cpp b/cpp/tests/t/pipelines/slac/ControlGrid.cpp index 98bce9d5ef2..5e8e5d59949 100644 --- a/cpp/tests/t/pipelines/slac/ControlGrid.cpp +++ b/cpp/tests/t/pipelines/slac/ControlGrid.cpp @@ -24,10 +24,10 @@ static t::geometry::PointCloud CreateTPCDFromFile( return t::geometry::PointCloud::FromLegacy(*pcd, core::Float32, device); } -class ControlGridPermuteDevices : public PermuteDevices {}; +class ControlGridPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(ControlGrid, ControlGridPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // TODO(wei): more well-designed test cases TEST_P(ControlGridPermuteDevices, Touch) { diff --git a/cpp/tests/t/pipelines/slac/SLAC.cpp b/cpp/tests/t/pipelines/slac/SLAC.cpp index bc31cb31044..73aef1ad212 100644 --- a/cpp/tests/t/pipelines/slac/SLAC.cpp +++ b/cpp/tests/t/pipelines/slac/SLAC.cpp @@ -27,10 +27,10 @@ namespace open3d { namespace tests { -class SLACPermuteDevices : public PermuteDevices {}; +class SLACPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(SLAC, SLACPermuteDevices, - 
testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // PointCloud is similar if fitness is higher and rmse is lower than tolerance // threshold. From 4cbdf15a900b822f4c5aeb0f7fa435e43218ddfb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:57:40 +0000 Subject: [PATCH 07/14] Optimize SYCL pipeline kernels: use nd_range + reduce_over_group instead of plain global atomics Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- .../kernel/FillInLinearSystemSYCL.cpp | 144 ++++-- .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 463 +++++++++-------- .../t/pipelines/kernel/RegistrationSYCL.cpp | 466 ++++++++++-------- 3 files changed, 616 insertions(+), 457 deletions(-) diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp index 2f16deec4e8..c1c524005a7 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -36,7 +36,12 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, "Unable to setup linear system: input length mismatch."); } - // First fill in a small 12 x 12 linear system. + // First fill in a small 12 x 12 linear system using group reduction. + // kLocalDim = 12*12 + 12 + 1 = 157 elements. 
+ static constexpr int kLocalDim12 = 144; // 12*12 + static constexpr int kLocalDimAtb = 12; + static constexpr int kLocalDimTotal = kLocalDim12 + kLocalDimAtb + 1; + core::Tensor AtA_local = core::Tensor::Zeros({12, 12}, core::Float32, device); core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device); @@ -50,47 +55,102 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, const float *Ri_normal_ps_ptr = static_cast(Ri_normal_ps.GetDataPtr()); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - int64_t workload_idx = id[0]; - const float *p_prime = Ti_ps_ptr + 3 * workload_idx; - const float *q_prime = Tj_qs_ptr + 3 * workload_idx; - const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx; - - float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] + - (p_prime[1] - q_prime[1]) * normal_p_prime[1] + - (p_prime[2] - q_prime[2]) * normal_p_prime[2]; - if (sycl::fabs(r) > threshold) return; - - float J_ij[12]; - J_ij[0] = -q_prime[2] * normal_p_prime[1] + - q_prime[1] * normal_p_prime[2]; - J_ij[1] = q_prime[2] * normal_p_prime[0] - - q_prime[0] * normal_p_prime[2]; - J_ij[2] = -q_prime[1] * normal_p_prime[0] + - q_prime[0] * normal_p_prime[1]; - J_ij[3] = normal_p_prime[0]; - J_ij[4] = normal_p_prime[1]; - J_ij[5] = normal_p_prime[2]; - for (int k = 0; k < 6; ++k) { - J_ij[k + 6] = -J_ij[k]; - } - - for (int i_local = 0; i_local < 12; ++i_local) { - for (int j_local = 0; j_local < 12; ++j_local) { - sycl::atomic_ref( - AtA_local_ptr[i_local * 12 + j_local]) += - J_ij[i_local] * J_ij[j_local]; - } - sycl::atomic_ref( - Atb_local_ptr[i_local]) += J_ij[i_local] * r; - } - sycl::atomic_ref(*residual_ptr) += - r * r; + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n 
+ wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + // Private accumulation buffer: [AtA_local(144) | + // Atb_local(12) | residual(1)] + float local_sum[kLocalDimTotal] = {}; + + if (gid < n) { + const float *p_prime = Ti_ps_ptr + 3 * gid; + const float *q_prime = Tj_qs_ptr + 3 * gid; + const float *normal_p_prime = + Ri_normal_ps_ptr + 3 * gid; + + float r = + (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * + normal_p_prime[2]; + + if (sycl::fabs(r) <= threshold) { + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; + ++i_local) { + for (int j_local = 0; j_local < 12; + ++j_local) { + local_sum[i_local * 12 + j_local] += + J_ij[i_local] * J_ij[j_local]; + } + local_sum[kLocalDim12 + i_local] += + J_ij[i_local] * r; + } + local_sum[kLocalDim12 + kLocalDimAtb] += + r * r; + } + } + + // Group-reduce each element and atomically add to + // the device-side local system buffers. 
+ auto grp = item.get_group(); + for (int k = 0; k < kLocalDim12; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[k], sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + AtA_local_ptr[k]) += gv; + } + } + for (int k = 0; k < kLocalDimAtb; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[kLocalDim12 + k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + Atb_local_ptr[k]) += gv; + } + } + { + float gv = sycl::reduce_over_group( + grp, + local_sum[kLocalDim12 + kLocalDimAtb], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref< + float, sycl::memory_order::acq_rel, + sycl::memory_scope::device>( + *residual_ptr) += gv; + } + } + }); }).wait_and_throw(); // Then fill-in the large linear system. diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp index d49ed13b4b1..59181b6d54c 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -25,15 +25,34 @@ using t::geometry::kernel::TransformIndexer; static constexpr int kReduceDimOdometry = 29; static constexpr int kJtJDimOdometry = 21; +/// Perform a group reduction over a fixed-size private float array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template +static inline void GroupReduceAndAdd(sycl::nd_item<1> item, + const float (&local_sum)[N], + float *global_sum_ptr) { + auto grp = item.get_group(); + for (int k = 0; k < N; ++k) { + float grp_val = sycl::reduce_over_group(grp, local_sum[k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref(global_sum_ptr[k]) += + grp_val; + } + } +} + void ComputeOdometryResultPointToPlaneSYCL( - const core::Tensor& source_vertex_map, - const core::Tensor& target_vertex_map, - const core::Tensor& target_normal_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, + const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, + const core::Tensor &target_normal_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, const float depth_outlier_trunc, const float depth_huber_delta) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); @@ -50,63 +69,71 @@ void ComputeOdometryResultPointToPlaneSYCL( core::Tensor global_sum = core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J[6] = {0}; - float r = 0; - const bool valid = GetJacobianPointToPlane( - x, y, depth_outlier_trunc, source_vertex_indexer, - target_vertex_indexer, target_normal_indexer, ti, J, r); - - if (valid) { - const float d_huber = HuberDeriv(r, depth_huber_delta); - const float r_huber = HuberLoss(r, depth_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - 
global_sum_ptr[offset++]) += J[i] * J[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref( - global_sum_ptr[21 + i]) += J[i] * d_huber; - } - sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J[6] = {0}; + float r = 0; + const bool valid = GetJacobianPointToPlane( + x, y, depth_outlier_trunc, + source_vertex_indexer, + target_vertex_indexer, + target_normal_indexer, ti, J, r); + + if (valid) { + const float d_huber = + HuberDeriv(r, depth_huber_delta); + const float r_huber = + HuberLoss(r, depth_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += J[i] * J[j]; + } + local_sum[21 + i] += J[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd( + item, local_sum, global_sum_ptr); + }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } void ComputeOdometryResultIntensitySYCL( - const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - 
float& inlier_residual, - int& inlier_count, + const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, const float depth_outlier_trunc, const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); @@ -127,70 +154,80 @@ void ComputeOdometryResultIntensitySYCL( core::Tensor global_sum = core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}; - float r_I = 0; - const bool valid = GetJacobianIntensity( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, ti, - J_I, r_I); - - if (valid) { - const float d_huber = HuberDeriv(r_I, intensity_huber_delta); - const float r_huber = HuberLoss(r_I, intensity_huber_delta); - - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - global_sum_ptr[offset++]) += J_I[i] * J_I[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref( - global_sum_ptr[21 + i]) += J_I[i] * d_huber; - } - sycl::atomic_ref( - global_sum_ptr[27]) += r_huber; - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + 
core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, r_I); + + if (valid) { + const float d_huber = + HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = + HuberLoss(r_I, intensity_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_I[i] * J_I[j]; + } + local_sum[21 + i] += J_I[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd( + item, local_sum, global_sum_ptr); + }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, - const core::Tensor& target_depth, - const core::Tensor& source_intensity, - const core::Tensor& target_intensity, - const core::Tensor& target_depth_dx, - const core::Tensor& target_depth_dy, - const core::Tensor& target_intensity_dx, - const core::Tensor& target_intensity_dy, - const core::Tensor& source_vertex_map, - const core::Tensor& intrinsics, - const core::Tensor& init_source_to_target, - core::Tensor& delta, - float& inlier_residual, - int& inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const 
float intensity_huber_delta) { +void ComputeOdometryResultHybridSYCL( + const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_depth_dx, + const core::Tensor &target_depth_dy, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); NDArrayIndexer target_depth_indexer(target_depth, 2); NDArrayIndexer source_intensity_indexer(source_intensity, 2); @@ -211,62 +248,76 @@ void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, core::Tensor global_sum = core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_I[6] = {0}, J_D[6] = {0}; - float r_I = 0, r_D = 0; - const bool valid = GetJacobianHybrid( - x, y, depth_outlier_trunc, source_depth_indexer, - target_depth_indexer, source_intensity_indexer, - target_intensity_indexer, target_depth_dx_indexer, - target_depth_dy_indexer, target_intensity_dx_indexer, - target_intensity_dy_indexer, source_vertex_indexer, ti, - J_I, J_D, r_I, r_D); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - global_sum_ptr[offset++]) += - J_I[i] * J_I[j] + J_D[i] * J_D[j]; - } - } - for (int i = 0; i < 6; ++i) { - sycl::atomic_ref( - 
global_sum_ptr[21 + i]) += - J_I[i] * HuberDeriv(r_I, intensity_huber_delta) + - J_D[i] * HuberDeriv(r_D, depth_huber_delta); - } - sycl::atomic_ref( - global_sum_ptr[27]) += - HuberLoss(r_I, intensity_huber_delta) + - HuberLoss(r_D, depth_huber_delta); - sycl::atomic_ref( - global_sum_ptr[28]) += 1.0f; - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_depth_dx_indexer, + target_depth_dy_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, J_D, r_I, + r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_I[i] * J_I[j] + + J_D[i] * J_D[j]; + } + local_sum[21 + i] += + J_I[i] * HuberDeriv( + r_I, + intensity_huber_delta) + + J_D[i] * HuberDeriv(r_D, + depth_huber_delta); + } + local_sum[27] += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd( + item, local_sum, global_sum_ptr); + }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_vertex_map, - const core::Tensor& 
target_vertex_map, - const core::Tensor& intrinsic, - const core::Tensor& source_to_target, - const float square_dist_thr, - core::Tensor& information) { +void ComputeOdometryInformationMatrixSYCL( + const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, + const core::Tensor &intrinsic, + const core::Tensor &source_to_target, + const float square_dist_thr, + core::Tensor &information) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); @@ -280,41 +331,57 @@ void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_vertex_map, core::Tensor global_sum = core::Tensor::Zeros({kJtJDimOdometry}, core::Float32, device); - float* global_sum_ptr = global_sum.GetDataPtr(); - - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - const int y = workload_idx / cols; - const int x = workload_idx % cols; - - float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - float rx = 0, ry = 0, rz = 0; - const bool valid = GetJacobianPointToPoint( - x, y, square_dist_thr, source_vertex_indexer, - target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz); - - if (valid) { - int offset = 0; - for (int i = 0; i < 6; ++i) { - for (int j = 0; j <= i; ++j) { - sycl::atomic_ref( - global_sum_ptr[offset++]) += J_x[i] * J_x[j] + - J_y[i] * J_y[j] + - J_z[i] * J_z[j]; - } - } - } + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float 
local_sum[kJtJDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, + source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, + rx, ry, rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_x[i] * J_x[j] + + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; + } + } + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); information = core::Tensor::Empty({6, 6}, core::Float64, host); global_sum = global_sum.To(host, core::Float64); - double* info_ptr = information.GetDataPtr(); - double* reduction_ptr = global_sum.GetDataPtr(); + double *info_ptr = information.GetDataPtr(); + double *reduction_ptr = global_sum.GetDataPtr(); for (int j = 0; j < 6; j++) { const int64_t reduction_idx = ((j * (j + 1)) / 2); for (int k = 0; k <= j; k++) { diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp index b44c696e7e4..80f4b2a4f73 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -21,6 +21,25 @@ namespace kernel { static constexpr int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) +/// Perform a group reduction over a fixed-size private array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template +static inline void GroupReduceAndAdd(sycl::nd_item<1> item, + const scalar_t (&local_sum)[N], + scalar_t *global_sum_ptr) { + auto grp = item.get_group(); + for (int k = 0; k < N; ++k) { + scalar_t grp_val = sycl::reduce_over_group(grp, local_sum[k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref(global_sum_ptr[k]) += + grp_val; + } + } +} + void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, const core::Tensor &target_points, const core::Tensor &target_normals, @@ -35,8 +54,11 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { scalar_t *global_sum_ptr = global_sum.GetDataPtr(); @@ -53,60 +75,54 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_ij[6] = {0}; - scalar_t r = 0; - const bool valid = - GetJacobianPointToPlane( - workload_idx, - source_points_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - J_ij, r); - - if (valid) { - const scalar_t w = - GetWeightFromRobustKernel(r); - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[i]) += - J_ij[j] * w * J_ij[k]; - ++i; + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups 
* wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = + item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_ij[6] = {0}; + scalar_t r = 0; + const bool valid = + GetJacobianPointToPlane< + scalar_t>( + gid, + source_points_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); + + if (valid) { + const scalar_t w = + GetWeightFromRobustKernel( + r); + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_ij[j] * w * + J_ij[k]; + } + local_sum[21 + j] += + J_ij[j] * w * r; + } + local_sum[27] += r; + local_sum[28] += + scalar_t(1); } - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[21 + j]) += - J_ij[j] * w * r; } - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[27]) += r; - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[28]) += - scalar_t(1); - } - }) - .wait_and_throw(); + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); }); }); @@ -131,8 +147,11 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { const scalar_t sqrt_lambda_geometric = @@ -159,72 +178,70 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - queue.parallel_for( - 
sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_G[6] = {0}, J_I[6] = {0}; - scalar_t r_G = 0, r_I = 0; - - const bool valid = - GetJacobianColoredICP( - workload_idx, - source_points_ptr, - source_colors_ptr, - target_points_ptr, - target_normals_ptr, - target_colors_ptr, - target_color_gradients_ptr, - correspondence_indices_ptr, - sqrt_lambda_geometric, - sqrt_lambda_photometric, - J_G, J_I, r_G, r_I); - - if (valid) { - const scalar_t w_G = - GetWeightFromRobustKernel(r_G); - const scalar_t w_I = - GetWeightFromRobustKernel(r_I); - - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[i]) += - J_G[j] * w_G * J_G[k] + - J_I[j] * w_I * J_I[k]; - ++i; + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = + item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_G[6] = {0}, + J_I[6] = {0}; + scalar_t r_G = 0, r_I = 0; + + const bool valid = + GetJacobianColoredICP< + scalar_t>( + gid, + source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, + r_I); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernel( + r_G); + const scalar_t w_I = + GetWeightFromRobustKernel( + r_I); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_G[j] * w_G * + J_G[k] + + J_I[j] * w_I * + J_I[k]; + } + local_sum[21 + j] += + J_G[j] * w_G * + r_G + + J_I[j] * w_I * r_I; + } + local_sum[27] += + r_G * r_G + r_I * r_I; + local_sum[28] += scalar_t(1); } - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - 
sycl::memory_scope:: - device>( - global_sum_ptr[21 + j]) += - J_G[j] * w_G * r_G + - J_I[j] * w_I * r_I; } - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[27]) += - r_G * r_G + r_I * r_I; - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[28]) += - scalar_t(1); - } - }) - .wait_and_throw(); + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); }); }); @@ -258,8 +275,11 @@ void ComputePoseDopplerICPSYCL( core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { const scalar_t sqrt_lambda_geometric = @@ -278,8 +298,8 @@ void ComputePoseDopplerICPSYCL( scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); queue.single_task([=]() { PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, - w_v_in_V_ptr, v_v_in_V_ptr, - v_s_in_S_ptr); + w_v_in_V_ptr, v_v_in_V_ptr, + v_s_in_S_ptr); }).wait_and_throw(); } @@ -308,78 +328,76 @@ void ComputePoseDopplerICPSYCL( const scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); - queue.parallel_for( - sycl::range<1>{(size_t)n}, - [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_G[6] = {0}, J_D[6] = {0}; - scalar_t r_G = 0, r_D = 0; - - const bool valid = GetJacobianDopplerICP< - scalar_t>( - workload_idx, source_points_ptr, - source_dopplers_ptr, - source_directions_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - R_S_to_V_ptr, r_v_to_s_in_V_ptr, - v_s_in_S_ptr, - 
reject_dynamic_outliers, - static_cast( - doppler_outlier_threshold), - sqrt_lambda_geometric, - sqrt_lambda_doppler, - sqrt_lambda_doppler_by_dt, J_G, - J_D, r_G, r_D); - - if (valid) { - const scalar_t w_G = - GetWeightFromRobustKernelFirst( - r_G); - const scalar_t w_D = - GetWeightFromRobustKernelSecond( - r_D); - - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[i]) += - J_G[j] * w_G * J_G[k] + - J_D[j] * w_D * J_D[k]; - ++i; + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = + item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_G[6] = {0}, + J_D[6] = {0}; + scalar_t r_G = 0, r_D = 0; + + const bool valid = + GetJacobianDopplerICP< + scalar_t>( + gid, + source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, J_D, r_G, + r_D); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernelFirst( + r_G); + const scalar_t w_D = + GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_G[j] * w_G * + J_G[k] + + J_D[j] * w_D * + J_D[k]; + } + local_sum[21 + j] += + J_G[j] * w_G * + r_G + + J_D[j] * w_D * r_D; + } + local_sum[27] += + r_G * r_G + r_D * r_D; + local_sum[28] += scalar_t(1); } - sycl::atomic_ref< - scalar_t, - sycl::memory_order:: - acq_rel, - sycl::memory_scope:: - device>( - global_sum_ptr[21 + j]) += - J_G[j] * w_G * r_G + - J_D[j] * w_D * r_D; } - sycl::atomic_ref< - scalar_t, - 
sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[27]) += - r_G * r_G + r_D * r_D; - sycl::atomic_ref< - scalar_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::device>( - global_sum_ptr[28]) += - scalar_t(1); - } - }) - .wait_and_throw(); + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); }); }); @@ -395,8 +413,11 @@ void ComputeInformationMatrixSYCL(const core::Tensor &target_points, core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); - sycl::queue queue = - core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { scalar_t *global_sum_ptr = global_sum.GetDataPtr(); @@ -405,27 +426,38 @@ void ComputeInformationMatrixSYCL(const core::Tensor &target_points, const int64_t *correspondence_indices_ptr = correspondence_indices.GetDataPtr(); - queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { - const int workload_idx = id[0]; - scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; - const bool valid = GetInformationJacobians( - workload_idx, target_points_ptr, - correspondence_indices_ptr, J_x, J_y, J_z); - - if (valid) { - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; ++k) { - sycl::atomic_ref( - global_sum_ptr[i]) += J_x[j] * J_x[k] + - J_y[j] * J_y[k] + - J_z[j] * J_z[k]; - ++i; - } - } - } + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[21] = {}; + + if (gid < n) { + scalar_t J_x[6] = {0}, J_y[6] = {0}, + J_z[6] = {0}; + const bool valid = + GetInformationJacobians( + gid, target_points_ptr, + 
correspondence_indices_ptr, + J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + local_sum[i++] += + J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + } + } + } + } + + GroupReduceAndAdd<21, scalar_t>(item, local_sum, + global_sum_ptr); + }); }).wait_and_throw(); const core::Device host(core::Device("CPU:0")); From 27ccd459dd2f487bd202bd3ba6eec58cc89cf09b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 13 Mar 2026 18:13:32 +0000 Subject: [PATCH 08/14] Apply automatic code formatting --- cpp/open3d/data/Dataset.h | 20 +- cpp/open3d/geometry/PointCloud.h | 3 +- .../GlobalOptimizationConvergenceCriteria.h | 6 +- .../pipelines/registration/Registration.h | 6 +- .../registration/TransformationEstimation.h | 3 +- .../kernel/FillInLinearSystemSYCL.cpp | 37 +- .../t/pipelines/kernel/RGBDOdometrySYCL.cpp | 80 ++-- .../t/pipelines/kernel/RegistrationSYCL.cpp | 371 +++++++++++------- .../t/pipelines/TransformationConverter.cpp | 7 +- .../t/pipelines/odometry/RGBDOdometry.cpp | 7 +- .../t/pipelines/registration/Feature.cpp | 7 +- .../t/pipelines/registration/Registration.cpp | 7 +- .../registration/TransformationEstimation.cpp | 7 +- cpp/tests/t/pipelines/slac/ControlGrid.cpp | 7 +- cpp/tests/t/pipelines/slac/SLAC.cpp | 7 +- 15 files changed, 326 insertions(+), 249 deletions(-) diff --git a/cpp/open3d/data/Dataset.h b/cpp/open3d/data/Dataset.h index 27df77dfb14..e66a5370a2a 100644 --- a/cpp/open3d/data/Dataset.h +++ b/cpp/open3d/data/Dataset.h @@ -754,9 +754,9 @@ class PaintedPlasterTexture : public DownloadDataset { }; /// \class RedwoodIndoorLivingRoom1 -/// \brief Data class for `RedwoodIndoorLivingRoom1` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. 
+/// \brief Data class for `RedwoodIndoorLivingRoom1` (Augmented ICL-NUIM +/// Dataset), containing dense point cloud, rgb sequence, clean depth sequence, +/// noisy depth sequence, oni sequence, and ground-truth camera trajectory. /// /// RedwoodIndoorLivingRoom1 /// ├── colors @@ -810,9 +810,9 @@ class RedwoodIndoorLivingRoom1 : public DownloadDataset { }; /// \class RedwoodIndoorLivingRoom2 -/// \brief Data class for `RedwoodIndoorLivingRoom2` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. +/// \brief Data class for `RedwoodIndoorLivingRoom2` (Augmented ICL-NUIM +/// Dataset), containing dense point cloud, rgb sequence, clean depth sequence, +/// noisy depth sequence, oni sequence, and ground-truth camera trajectory. /// /// RedwoodIndoorLivingRoom2 /// ├── colors @@ -867,8 +867,8 @@ class RedwoodIndoorLivingRoom2 : public DownloadDataset { /// \class RedwoodIndoorOffice1 /// \brief Data class for `RedwoodIndoorOffice1` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. +/// containing dense point cloud, rgb sequence, clean depth sequence, noisy +/// depth sequence, oni sequence, and ground-truth camera trajectory. /// /// RedwoodIndoorOffice1 /// ├── colors @@ -923,8 +923,8 @@ class RedwoodIndoorOffice1 : public DownloadDataset { /// \class RedwoodIndoorOffice2 /// \brief Data class for `RedwoodIndoorOffice2` (Augmented ICL-NUIM Dataset), -/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth -/// sequence, oni sequence, and ground-truth camera trajectory. +/// containing dense point cloud, rgb sequence, clean depth sequence, noisy +/// depth sequence, oni sequence, and ground-truth camera trajectory. 
/// /// RedwoodIndoorOffice2 /// ├── colors diff --git a/cpp/open3d/geometry/PointCloud.h b/cpp/open3d/geometry/PointCloud.h index 66e26676d18..c528095637d 100644 --- a/cpp/open3d/geometry/PointCloud.h +++ b/cpp/open3d/geometry/PointCloud.h @@ -281,7 +281,8 @@ class PointCloud : public Geometry3D { /// /// /// \param input PointCloud to use for covariance computation. - /// \param search_param The KDTree search parameters for neighborhood search. + /// \param search_param The KDTree search parameters for neighborhood + /// search. static std::vector EstimatePerPointCovariances( const PointCloud &input, const KDTreeSearchParam &search_param = KDTreeSearchParamKNN()); diff --git a/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h b/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h index 892c92ee8ce..ba4d283df58 100644 --- a/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h +++ b/cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h @@ -81,9 +81,9 @@ class GlobalOptimizationConvergenceCriteria { /// increments. /// \param min_right_term Minimum right term value. /// \param min_residual Minimum residual value. - /// \param max_iteration_lm Maximum iteration number for Levenberg Marquardt method. - /// \param upper_scale_factor Upper scale factor value. - /// \param lower_scale_factor Lower scale factor value. + /// \param max_iteration_lm Maximum iteration number for Levenberg Marquardt + /// method. \param upper_scale_factor Upper scale factor value. \param + /// lower_scale_factor Lower scale factor value. 
GlobalOptimizationConvergenceCriteria( int max_iteration = 100, double min_relative_increment = 1e-6, diff --git a/cpp/open3d/pipelines/registration/Registration.h b/cpp/open3d/pipelines/registration/Registration.h index 427e1a78cb9..c2c2f91108b 100644 --- a/cpp/open3d/pipelines/registration/Registration.h +++ b/cpp/open3d/pipelines/registration/Registration.h @@ -39,9 +39,9 @@ class ICPConvergenceCriteria { /// /// \param relative_fitness If relative change (difference) of fitness score /// is lower than relative_fitness, the iteration stops. - /// \param relative_rmse If relative change (difference) of inliner RMSE score is - /// lower than relative_rmse, the iteration stops. - /// \param max_iteration Maximum iteration before iteration stops. + /// \param relative_rmse If relative change (difference) of inliner RMSE + /// score is lower than relative_rmse, the iteration stops. \param + /// max_iteration Maximum iteration before iteration stops. ICPConvergenceCriteria(double relative_fitness = 1e-6, double relative_rmse = 1e-6, int max_iteration = 30) diff --git a/cpp/open3d/pipelines/registration/TransformationEstimation.h b/cpp/open3d/pipelines/registration/TransformationEstimation.h index a953f40ef90..40b3229f6ca 100644 --- a/cpp/open3d/pipelines/registration/TransformationEstimation.h +++ b/cpp/open3d/pipelines/registration/TransformationEstimation.h @@ -139,7 +139,8 @@ class TransformationEstimationPointToPlane : public TransformationEstimation { ~TransformationEstimationPointToPlane() override {} /// \brief Constructor that takes as input a RobustKernel. - /// \param kernel Any of the implemented statistical robust kernel for outlier rejection. + /// \param kernel Any of the implemented statistical robust kernel for + /// outlier rejection. 
explicit TransformationEstimationPointToPlane( std::shared_ptr kernel) : kernel_(std::move(kernel)) {} diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp index c1c524005a7..a43767ba40e 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -76,13 +76,12 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, const float *normal_p_prime = Ri_normal_ps_ptr + 3 * gid; - float r = - (p_prime[0] - q_prime[0]) * - normal_p_prime[0] + - (p_prime[1] - q_prime[1]) * - normal_p_prime[1] + - (p_prime[2] - q_prime[2]) * - normal_p_prime[2]; + float r = (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * + normal_p_prime[2]; if (sycl::fabs(r) <= threshold) { float J_ij[12]; @@ -109,8 +108,7 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, local_sum[kLocalDim12 + i_local] += J_ij[i_local] * r; } - local_sum[kLocalDim12 + kLocalDimAtb] += - r * r; + local_sum[kLocalDim12 + kLocalDimAtb] += r * r; } } @@ -121,9 +119,9 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, float gv = sycl::reduce_over_group( grp, local_sum[k], sycl::plus{}); if (item.get_local_id(0) == 0) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( + sycl::atomic_ref( AtA_local_ptr[k]) += gv; } } @@ -132,21 +130,20 @@ void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, grp, local_sum[kLocalDim12 + k], sycl::plus{}); if (item.get_local_id(0) == 0) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - sycl::memory_scope::device>( + sycl::atomic_ref( Atb_local_ptr[k]) += gv; } } { float gv = sycl::reduce_over_group( - grp, - local_sum[kLocalDim12 + kLocalDimAtb], + grp, local_sum[kLocalDim12 + kLocalDimAtb], sycl::plus{}); if (item.get_local_id(0) == 0) { - sycl::atomic_ref< - float, sycl::memory_order::acq_rel, - 
sycl::memory_scope::device>( + sycl::atomic_ref( *residual_ptr) += gv; } } diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp index 59181b6d54c..c27b3159609 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -34,8 +34,8 @@ static inline void GroupReduceAndAdd(sycl::nd_item<1> item, float *global_sum_ptr) { auto grp = item.get_group(); for (int k = 0; k < N; ++k) { - float grp_val = sycl::reduce_over_group(grp, local_sum[k], - sycl::plus{}); + float grp_val = + sycl::reduce_over_group(grp, local_sum[k], sycl::plus{}); if (item.get_local_id(0) == 0) { sycl::atomic_ref(global_sum_ptr[k]) += @@ -113,8 +113,8 @@ void ComputeOdometryResultPointToPlaneSYCL( } } - GroupReduceAndAdd( - item, local_sum, global_sum_ptr); + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); }); }).wait_and_throw(); @@ -192,8 +192,7 @@ void ComputeOdometryResultIntensitySYCL( int offset = 0; for (int i = 0; i < 6; ++i) { for (int j = 0; j <= i; ++j) { - local_sum[offset++] += - J_I[i] * J_I[j]; + local_sum[offset++] += J_I[i] * J_I[j]; } local_sum[21 + i] += J_I[i] * d_huber; } @@ -202,32 +201,31 @@ void ComputeOdometryResultIntensitySYCL( } } - GroupReduceAndAdd( - item, local_sum, global_sum_ptr); + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryResultHybridSYCL( - const core::Tensor &source_depth, - const core::Tensor &target_depth, - const core::Tensor &source_intensity, - const core::Tensor &target_intensity, - const core::Tensor &target_depth_dx, - const core::Tensor &target_depth_dy, - const core::Tensor &target_intensity_dx, - const core::Tensor &target_intensity_dy, - const core::Tensor &source_vertex_map, - const core::Tensor &intrinsics, - const core::Tensor &init_source_to_target, - core::Tensor &delta, - float 
&inlier_residual, - int &inlier_count, - const float depth_outlier_trunc, - const float depth_huber_delta, - const float intensity_huber_delta) { +void ComputeOdometryResultHybridSYCL(const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_depth_dx, + const core::Tensor &target_depth_dy, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { NDArrayIndexer source_depth_indexer(source_depth, 2); NDArrayIndexer target_depth_indexer(target_depth, 2); NDArrayIndexer source_intensity_indexer(source_intensity, 2); @@ -290,11 +288,14 @@ void ComputeOdometryResultHybridSYCL( J_D[i] * J_D[j]; } local_sum[21 + i] += - J_I[i] * HuberDeriv( - r_I, - intensity_huber_delta) + - J_D[i] * HuberDeriv(r_D, - depth_huber_delta); + J_I[i] * + HuberDeriv( + r_I, + intensity_huber_delta) + + J_D[i] * + HuberDeriv( + r_D, + depth_huber_delta); } local_sum[27] += HuberLoss(r_I, intensity_huber_delta) + @@ -303,21 +304,20 @@ void ComputeOdometryResultHybridSYCL( } } - GroupReduceAndAdd( - item, local_sum, global_sum_ptr); + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); }); }).wait_and_throw(); DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); } -void ComputeOdometryInformationMatrixSYCL( - const core::Tensor &source_vertex_map, - const core::Tensor &target_vertex_map, - const core::Tensor &intrinsic, - const core::Tensor &source_to_target, - const float square_dist_thr, - core::Tensor &information) { +void ComputeOdometryInformationMatrixSYCL(const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, 
+ const core::Tensor &intrinsic, + const core::Tensor &source_to_target, + const float square_dist_thr, + core::Tensor &information) { NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp index 80f4b2a4f73..c7503191fdb 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -79,22 +79,19 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, cgh.parallel_for( sycl::nd_range<1>{num_groups * wgs, wgs}, [=](sycl::nd_item<1> item) { - const int gid = - item.get_global_id(0); + const int gid = item.get_global_id(0); scalar_t local_sum[kReduceDim] = {}; if (gid < n) { scalar_t J_ij[6] = {0}; scalar_t r = 0; - const bool valid = - GetJacobianPointToPlane< - scalar_t>( - gid, - source_points_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - J_ij, r); + const bool valid = GetJacobianPointToPlane< + scalar_t>( + gid, source_points_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); if (valid) { const scalar_t w = @@ -112,8 +109,7 @@ void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, J_ij[j] * w * r; } local_sum[27] += r; - local_sum[28] += - scalar_t(1); + local_sum[28] += scalar_t(1); } } @@ -182,8 +178,7 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, cgh.parallel_for( sycl::nd_range<1>{num_groups * wgs, wgs}, [=](sycl::nd_item<1> item) { - const int gid = - item.get_global_id(0); + const int gid = item.get_global_id(0); scalar_t local_sum[kReduceDim] = {}; if (gid < n) { @@ -191,21 +186,18 @@ void ComputePoseColoredICPSYCL(const core::Tensor &source_points, J_I[6] = {0}; scalar_t r_G = 0, r_I = 0; - const bool valid = - GetJacobianColoredICP< - scalar_t>( - gid, - source_points_ptr, - source_colors_ptr, - 
target_points_ptr, - target_normals_ptr, - target_colors_ptr, - target_color_gradients_ptr, - correspondence_indices_ptr, - sqrt_lambda_geometric, - sqrt_lambda_photometric, - J_G, J_I, r_G, - r_I); + const bool valid = GetJacobianColoredICP< + scalar_t>( + gid, source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, r_I); if (valid) { const scalar_t w_G = @@ -281,125 +273,205 @@ void ComputePoseDopplerICPSYCL( const size_t wgs = device_props.max_work_group_size; const size_t num_groups = ((size_t)n + wgs - 1) / wgs; - DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { - const scalar_t sqrt_lambda_geometric = - sqrt(1.0 - static_cast(lambda_doppler)); - const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler); - const scalar_t sqrt_lambda_doppler_by_dt = - sqrt_lambda_doppler / static_cast(period); - - // Pre-compute v_s_in_S using a single-task kernel. 
- { - const scalar_t *R_S_to_V_ptr = R_S_to_V.GetDataPtr(); - const scalar_t *r_v_to_s_in_V_ptr = - r_v_to_s_in_V.GetDataPtr(); - const scalar_t *w_v_in_V_ptr = w_v_in_V.GetDataPtr(); - const scalar_t *v_v_in_V_ptr = v_v_in_V.GetDataPtr(); - scalar_t *v_s_in_S_ptr = v_s_in_S.GetDataPtr(); - queue.single_task([=]() { - PreComputeForDopplerICP(R_S_to_V_ptr, r_v_to_s_in_V_ptr, - w_v_in_V_ptr, v_v_in_V_ptr, - v_s_in_S_ptr); - }).wait_and_throw(); - } - - scalar_t *global_sum_ptr = global_sum.GetDataPtr(); - - DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION( - scalar_t, kernel_geometric.type_, - kernel_geometric.scaling_parameter_, kernel_doppler.type_, - kernel_doppler.scaling_parameter_, [&]() { - const scalar_t *source_points_ptr = - source_points.GetDataPtr(); - const scalar_t *source_dopplers_ptr = - source_dopplers.GetDataPtr(); - const scalar_t *source_directions_ptr = - source_directions.GetDataPtr(); - const scalar_t *target_points_ptr = - target_points.GetDataPtr(); - const scalar_t *target_normals_ptr = - target_normals.GetDataPtr(); - const int64_t *correspondence_indices_ptr = - correspondence_indices.GetDataPtr(); - const scalar_t *R_S_to_V_ptr = - R_S_to_V.GetDataPtr(); - const scalar_t *r_v_to_s_in_V_ptr = - r_v_to_s_in_V.GetDataPtr(); - const scalar_t *v_s_in_S_ptr = - v_s_in_S.GetDataPtr(); - - queue.submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<1>{num_groups * wgs, wgs}, - [=](sycl::nd_item<1> item) { - const int gid = - item.get_global_id(0); - scalar_t local_sum[kReduceDim] = {}; - - if (gid < n) { - scalar_t J_G[6] = {0}, - J_D[6] = {0}; - scalar_t r_G = 0, r_D = 0; - - const bool valid = - GetJacobianDopplerICP< - scalar_t>( - gid, - source_points_ptr, - source_dopplers_ptr, - source_directions_ptr, - target_points_ptr, - target_normals_ptr, - correspondence_indices_ptr, - R_S_to_V_ptr, - r_v_to_s_in_V_ptr, - v_s_in_S_ptr, - reject_dynamic_outliers, - static_cast( - doppler_outlier_threshold), - sqrt_lambda_geometric, - 
sqrt_lambda_doppler, - sqrt_lambda_doppler_by_dt, - J_G, J_D, r_G, - r_D); - - if (valid) { - const scalar_t w_G = - GetWeightFromRobustKernelFirst( - r_G); - const scalar_t w_D = - GetWeightFromRobustKernelSecond( - r_D); - - int i = 0; - for (int j = 0; j < 6; ++j) { - for (int k = 0; k <= j; - ++k) { - local_sum[i++] += - J_G[j] * w_G * - J_G[k] + - J_D[j] * w_D * - J_D[k]; - } - local_sum[21 + j] += - J_G[j] * w_G * - r_G + - J_D[j] * w_D * r_D; - } - local_sum[27] += - r_G * r_G + r_D * r_D; - local_sum[28] += scalar_t(1); - } + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, + [&]() { + const scalar_t sqrt_lambda_geometric = + sqrt(1.0 - + static_cast( + lambda_doppler)); + const scalar_t sqrt_lambda_doppler = + sqrt(lambda_doppler); + const scalar_t + sqrt_lambda_doppler_by_dt = + sqrt_lambda_doppler / + static_cast( + period); + + // Pre-compute v_s_in_S using a + // single-task kernel. + { + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *w_v_in_V_ptr = + w_v_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *v_v_in_V_ptr = + v_v_in_V.GetDataPtr< + scalar_t>(); + scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + queue.single_task([=]() { + PreComputeForDopplerICP( + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + w_v_in_V_ptr, + v_v_in_V_ptr, + v_s_in_S_ptr); + }).wait_and_throw(); } - GroupReduceAndAdd( - item, local_sum, - global_sum_ptr); + scalar_t *global_sum_ptr = + global_sum.GetDataPtr< + scalar_t>(); + + DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(scalar_t, + kernel_geometric + .type_, + kernel_geometric + .scaling_parameter_, + kernel_doppler + .type_, + kernel_doppler + .scaling_parameter_, + [&]() { + const scalar_t *source_points_ptr = + source_points + .GetDataPtr< + scalar_t>(); + const scalar_t *source_dopplers_ptr = + source_dopplers + .GetDataPtr< + scalar_t>(); + const scalar_t *source_directions_ptr = + source_directions + 
.GetDataPtr< + scalar_t>(); + const scalar_t *target_points_ptr = + target_points + .GetDataPtr< + scalar_t>(); + const scalar_t *target_normals_ptr = + target_normals + .GetDataPtr< + scalar_t>(); + const int64_t *correspondence_indices_ptr = + correspondence_indices + .GetDataPtr< + int64_t>(); + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V + .GetDataPtr< + scalar_t>(); + const scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + + queue + .submit([&](sycl::handler + &cgh) { + cgh.parallel_for( + sycl::nd_range< + 1>{ + num_groups * + wgs, + wgs}, + [=](sycl::nd_item< + 1> item) { + const int gid = + item.get_global_id( + 0); + scalar_t local_sum + [kReduceDim] = + {}; + + if (gid < + n) { + scalar_t J_G + [6] = {0}, + J_D[6] = { + 0}; + scalar_t + r_G = 0, + r_D = 0; + + const bool valid = GetJacobianDopplerICP< + scalar_t>( + gid, + source_points_ptr, + source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast< + scalar_t>( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, + J_D, + r_G, + r_D); + + if (valid) { + const scalar_t + w_G = GetWeightFromRobustKernelFirst( + r_G); + const scalar_t + w_D = GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; + j < + 6; + ++j) { + for (int k = 0; + k <= + j; + ++k) { + local_sum + [i++] += + J_G[j] * + w_G * + J_G[k] + + J_D[j] * + w_D * + J_D[k]; + } + local_sum + [21 + + j] += + J_G[j] * + w_G * + r_G + + J_D[j] * + w_D * + r_D; + } + local_sum + [27] += + r_G * r_G + + r_D * r_D; + local_sum[28] += scalar_t( + 1); + } + } + + GroupReduceAndAdd< + kReduceDim, + scalar_t>( + item, + local_sum, + global_sum_ptr); + }); + }) + .wait_and_throw(); + }); }); - }).wait_and_throw(); - }); - }); 
DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); } @@ -446,10 +518,9 @@ void ComputeInformationMatrixSYCL(const core::Tensor &target_points, int i = 0; for (int j = 0; j < 6; ++j) { for (int k = 0; k <= j; ++k) { - local_sum[i++] += - J_x[j] * J_x[k] + - J_y[j] * J_y[k] + - J_z[j] * J_z[k]; + local_sum[i++] += J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; } } } diff --git a/cpp/tests/t/pipelines/TransformationConverter.cpp b/cpp/tests/t/pipelines/TransformationConverter.cpp index 8036b55a08f..a4c092c62d6 100644 --- a/cpp/tests/t/pipelines/TransformationConverter.cpp +++ b/cpp/tests/t/pipelines/TransformationConverter.cpp @@ -15,9 +15,10 @@ namespace open3d { namespace tests { class TransformationConverterPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(TransformationConverter, - TransformationConverterPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + TransformationConverter, + TransformationConverterPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TransformationConverterPermuteDevices, RtToTransformation) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp index 7d1b8b6466b..64d2a97d4a7 100644 --- a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp +++ b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp @@ -22,9 +22,10 @@ namespace open3d { namespace tests { class OdometryPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(Odometry, - OdometryPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + Odometry, + OdometryPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); core::Tensor CreateIntrisicTensor() { camera::PinholeCameraIntrinsic intrinsic = camera::PinholeCameraIntrinsic( diff --git a/cpp/tests/t/pipelines/registration/Feature.cpp 
b/cpp/tests/t/pipelines/registration/Feature.cpp index 1266ab7f705..7cb5f27ea8c 100644 --- a/cpp/tests/t/pipelines/registration/Feature.cpp +++ b/cpp/tests/t/pipelines/registration/Feature.cpp @@ -21,9 +21,10 @@ namespace open3d { namespace tests { class FeaturePermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(Feature, - FeaturePermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + Feature, + FeaturePermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(FeaturePermuteDevices, SelectByIndex) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/registration/Registration.cpp b/cpp/tests/t/pipelines/registration/Registration.cpp index 76b31da3dd9..d3d0544e27c 100644 --- a/cpp/tests/t/pipelines/registration/Registration.cpp +++ b/cpp/tests/t/pipelines/registration/Registration.cpp @@ -27,9 +27,10 @@ namespace open3d { namespace tests { class RegistrationPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(Registration, - RegistrationPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + Registration, + RegistrationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(RegistrationPermuteDevices, ICPConvergenceCriteriaConstructor) { // Constructor. 
diff --git a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp index c98ab268529..afc7f40e7a8 100644 --- a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp +++ b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp @@ -16,9 +16,10 @@ namespace open3d { namespace tests { class TransformationEstimationPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(TransformationEstimation, - TransformationEstimationPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + TransformationEstimation, + TransformationEstimationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); static std:: tuple diff --git a/cpp/tests/t/pipelines/slac/ControlGrid.cpp b/cpp/tests/t/pipelines/slac/ControlGrid.cpp index 5e8e5d59949..7f716f553ee 100644 --- a/cpp/tests/t/pipelines/slac/ControlGrid.cpp +++ b/cpp/tests/t/pipelines/slac/ControlGrid.cpp @@ -25,9 +25,10 @@ static t::geometry::PointCloud CreateTPCDFromFile( } class ControlGridPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(ControlGrid, - ControlGridPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + ControlGrid, + ControlGridPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // TODO(wei): more well-designed test cases TEST_P(ControlGridPermuteDevices, Touch) { diff --git a/cpp/tests/t/pipelines/slac/SLAC.cpp b/cpp/tests/t/pipelines/slac/SLAC.cpp index 73aef1ad212..00926a38741 100644 --- a/cpp/tests/t/pipelines/slac/SLAC.cpp +++ b/cpp/tests/t/pipelines/slac/SLAC.cpp @@ -28,9 +28,10 @@ namespace open3d { namespace tests { class SLACPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(SLAC, - SLACPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P( + SLAC, + 
SLACPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // PointCloud is similar if fitness is higher and rmse is lower than tolerance // threshold. From 83753680dc361499f8810968156931bef19ab8f9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 07:18:18 +0000 Subject: [PATCH 09/14] Initial plan From 4018268dcb51e73d0d9a4f4228b2aa666f9f89fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 07:30:51 +0000 Subject: [PATCH 10/14] Add SYCL nearest-neighbor search support Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/core/CMakeLists.txt | 1 + cpp/open3d/core/nns/FixedRadiusIndex.cpp | 28 + cpp/open3d/core/nns/FixedRadiusIndex.h | 34 ++ cpp/open3d/core/nns/KnnIndex.cpp | 26 +- cpp/open3d/core/nns/KnnIndex.h | 18 +- cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp | 511 ++++++++++++++++++ cpp/open3d/core/nns/NearestNeighborSearch.cpp | 47 +- cpp/open3d/core/nns/NearestNeighborSearch.h | 20 +- .../core/nns/nearest_neighbor_search.cpp | 14 +- cpp/tests/core/NearestNeighborSearch.cpp | 2 +- python/test/core/test_nn.py | 6 +- 11 files changed, 670 insertions(+), 37 deletions(-) create mode 100644 cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index 210cc33058a..7821b4a8f97 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -102,6 +102,7 @@ open3d_sycl_target_sources(core_impl PRIVATE linalg/LeastSquaresSYCL.cpp linalg/LUSYCL.cpp linalg/MatmulSYCL.cpp + nns/KnnSearchOpsSYCL.cpp linalg/SolveSYCL.cpp linalg/SVDSYCL.cpp linalg/TriSYCL.cpp diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.cpp b/cpp/open3d/core/nns/FixedRadiusIndex.cpp index 167a6701a32..b1254c396e1 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.cpp +++ b/cpp/open3d/core/nns/FixedRadiusIndex.cpp @@ -106,6 +106,14 
@@ bool FixedRadiusIndex::SetTensorData(const Tensor &dataset_points, utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return true; +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { CALL_BUILD(float, BuildSpatialHashTableCPU) @@ -173,6 +181,16 @@ std::tuple FixedRadiusIndex::SearchRadius( utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + FixedRadiusSearchSYCL(RADIUS_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { @@ -242,6 +260,16 @@ std::tuple FixedRadiusIndex::SearchHybrid( utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + HybridSearchSYCL(HYBRID_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. 
Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.h b/cpp/open3d/core/nns/FixedRadiusIndex.h index 838c1ccc484..d3247bb7d29 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.h +++ b/cpp/open3d/core/nns/FixedRadiusIndex.h @@ -376,6 +376,40 @@ void HybridSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool sort, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); + +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance); +#endif + /// \class FixedRadiusIndex /// /// \brief FixedRadiusIndex for nearest neighbor range search. diff --git a/cpp/open3d/core/nns/KnnIndex.cpp b/cpp/open3d/core/nns/KnnIndex.cpp index 2d1f5de6b51..596ad92d69d 100644 --- a/cpp/open3d/core/nns/KnnIndex.cpp +++ b/cpp/open3d/core/nns/KnnIndex.cpp @@ -67,6 +67,17 @@ bool KnnIndex::SetTensorData(const Tensor& dataset_points, utility::LogError( "GPU Tensor is not supported when -DBUILD_CUDA_MODULE=OFF. 
" "Please recompile Open3d With -DBUILD_CUDA_MODULE=ON."); +#endif + } else if (dataset_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + dataset_points_ = dataset_points.Contiguous(); + points_row_splits_ = points_row_splits.Contiguous(); + index_dtype_ = index_dtype; + return true; +#else + utility::LogError( + "SYCL Tensor is not supported when -DBUILD_SYCL_MODULE=OFF. " + "Please recompile Open3D with -DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( @@ -126,11 +137,22 @@ std::pair KnnIndex::SearchKnn(const Tensor& query_points, utility::LogError( "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + const Dtype index_dtype = GetIndexDtype(); + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + KnnSearchSYCL(KNN_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " - "-DBUILD_CUDA_MODULE=ON."); + "KnnIndex only supports CUDA and SYCL tensors. Please use " + "NanoFlannIndex instead for CPU tensors."); } return std::make_pair(neighbors_index, neighbors_distance); } diff --git a/cpp/open3d/core/nns/KnnIndex.h b/cpp/open3d/core/nns/KnnIndex.h index 458a3dd0b35..6628c92850c 100644 --- a/cpp/open3d/core/nns/KnnIndex.h +++ b/cpp/open3d/core/nns/KnnIndex.h @@ -29,14 +29,28 @@ void KnnSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); +#endif + class KnnIndex : public NNSIndex { public: KnnIndex(); /// \brief Parameterized Constructor. 
/// - /// \param dataset_points Provides a set of data points as Tensor for KDTree - /// construction. + /// \param dataset_points Provides a set of data points as Tensor for nearest + /// neighbor search. CPU tensors use NanoFlann through + /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors are + /// handled by this class. KnnIndex(const Tensor& dataset_points); KnnIndex(const Tensor& dataset_points, const Dtype& index_dtype); ~KnnIndex(); diff --git a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp new file mode 100644 index 00000000000..5bb4f2d02a7 --- /dev/null +++ b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp @@ -0,0 +1,511 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include "open3d/core/MemoryManager.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/nns/FixedRadiusIndex.h" +#include "open3d/core/nns/KnnIndex.h" +#include "open3d/core/nns/NeighborSearchAllocator.h" +#include "open3d/utility/Helper.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace nns { + +namespace { + +template +void ComputeSquaredDistancesSYCL(const Device& device, + const T* points_ptr, + int64_t num_points, + int64_t dimension, + const T* query_ptr, + T* distances_ptr) { + if (num_points == 0) { + return; + } + + constexpr int64_t kWorkGroupSize = 128; + const int64_t global_size = + utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.submit([&](sycl::handler& cgh) { + sycl::local_accessor 
local_query( + sycl::range<1>(dimension), cgh); + cgh.parallel_for( + sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t lid = item.get_local_linear_id(); + for (int64_t d = lid; d < dimension; + d += item.get_local_range(0)) { + local_query[d] = query_ptr[d]; + } + item.barrier(sycl::access::fence_space::local_space); + + const int64_t point_idx = item.get_global_linear_id(); + if (point_idx >= num_points) { + return; + } + + const T* point = points_ptr + point_idx * dimension; + T dist = 0; + for (int64_t d = 0; d < dimension; ++d) { + const T diff = point[d] - local_query[d]; + dist += diff * diff; + } + distances_ptr[point_idx] = dist; + }); + }).wait_and_throw(); +} + +template +void SortIndicesByDistanceSYCL(const Device& device, + const T* distances_ptr, + TIndex* indices_ptr, + int64_t num_points) { + if (num_points <= 1) { + return; + } + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + std::sort(policy, indices_ptr, indices_ptr + num_points, + [distances_ptr](TIndex lhs, TIndex rhs) { + const T lhs_dist = distances_ptr[lhs]; + const T rhs_dist = distances_ptr[rhs]; + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs < rhs; + }); + queue.wait_and_throw(); +} + +template +int64_t CountWithinRadiusSYCL(const Device& device, + const T* distances_ptr, + const TIndex* sorted_indices_ptr, + int64_t num_points, + T threshold) { + if (num_points == 0) { + return 0; + } + + constexpr int64_t kWorkGroupSize = 128; + const int64_t global_size = + utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; + Tensor count = Tensor::Zeros({1}, Int64, device); + int64_t* count_ptr = count.GetDataPtr(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.submit([&](sycl::handler& cgh) { + 
cgh.parallel_for( + sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t idx = item.get_global_linear_id(); + int local_count = 0; + if (idx < num_points) { + local_count = + distances_ptr[sorted_indices_ptr[idx]] <= + threshold + ? 1 + : 0; + } + auto group = item.get_group(); + const int group_count = sycl::reduce_over_group( + group, local_count, sycl::plus()); + if (item.get_local_linear_id() == 0 && + group_count > 0) { + sycl::atomic_ref + count_ref(*count_ptr); + count_ref += group_count; + } + }); + }).wait_and_throw(); + + return count.To(Device("CPU:0")).Item(); +} + +template +void GatherResultsSYCL(const Device& device, + const T* distances_ptr, + const TIndex* sorted_indices_ptr, + int64_t num_results, + TIndex index_offset, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + if (num_results == 0) { + return; + } + + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_results), [=](sycl::id<1> id) { + const int64_t i = id[0]; + const TIndex point_idx = sorted_indices_ptr[i]; + out_indices_ptr[i] = point_idx + index_offset; + if (out_distances_ptr != nullptr) { + out_distances_ptr[i] = distances_ptr[point_idx]; + } + }).wait_and_throw(); +} + +template +void ProcessQuerySYCL(const Tensor& points, + const Tensor& query, + int64_t num_results, + TIndex index_offset, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + const Device device = points.GetDevice(); + const int64_t num_points = points.GetShape(0); + + Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); + Tensor sorted_indices = Tensor::Arange( + 0, num_points, 1, Dtype::FromType(), device); + + ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, + points.GetShape(1), query.GetDataPtr(), + distances.GetDataPtr()); + SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), + sorted_indices.GetDataPtr(), + 
num_points); + GatherResultsSYCL(device, distances.GetDataPtr(), + sorted_indices.GetDataPtr(), + num_results, index_offset, out_indices_ptr, + out_distances_ptr); +} + +template +int64_t CountQueryResultsWithinRadiusSYCL(const Tensor& points, + const Tensor& query, + T threshold) { + const Device device = points.GetDevice(); + const int64_t num_points = points.GetShape(0); + + Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); + Tensor sorted_indices = Tensor::Arange( + 0, num_points, 1, Dtype::FromType(), device); + + ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, + points.GetShape(1), query.GetDataPtr(), + distances.GetDataPtr()); + SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), + sorted_indices.GetDataPtr(), + num_points); + return CountWithinRadiusSYCL( + device, distances.GetDataPtr(), sorted_indices.GetDataPtr(), + num_points, threshold); +} + +template +void GatherQueryResultsWithinRadiusSYCL(const Tensor& points, + const Tensor& query, + int64_t num_results, + TIndex index_offset, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + ProcessQuerySYCL(points, query, num_results, index_offset, + out_indices_ptr, out_distances_ptr); +} + +} // namespace + +template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + const Device device = points.GetDevice(); + const int batch_size = points_row_splits.GetShape(0) - 1; + std::vector> batch_output_allocators( + batch_size, NeighborSearchAllocator(device)); + + int64_t* neighbors_row_splits_ptr = + neighbors_row_splits.GetDataPtr(); + int64_t last_neighbors_count = 0; + int64_t batch_knn = 0; + + for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = 
points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + batch_knn = std::min(knn, num_points_i); + + neighbors_row_splits_ptr[query_begin] = last_neighbors_count; + for (int64_t q = 0; q < num_queries_i; ++q) { + neighbors_row_splits_ptr[query_begin + q + 1] = + last_neighbors_count + (q + 1) * batch_knn; + } + last_neighbors_count += num_queries_i * batch_knn; + + TIndex* indices_ptr; + T* distances_ptr; + batch_output_allocators[batch_idx].AllocIndices( + &indices_ptr, num_queries_i * batch_knn, TIndex(-1)); + batch_output_allocators[batch_idx].AllocDistances( + &distances_ptr, num_queries_i * batch_knn, T(0)); + + for (int64_t q = 0; q < num_queries_i; ++q) { + const Tensor query = queries_i.Slice(0, q, q + 1).Flatten(); + ProcessQuerySYCL( + points_i, query, batch_knn, TIndex(point_begin), + indices_ptr + q * batch_knn, distances_ptr + q * batch_knn); + } + } + + if (batch_size == 1) { + neighbors_index = + batch_output_allocators[0].NeighborsIndex().View({queries.GetShape(0), + batch_knn}); + neighbors_distance = batch_output_allocators[0].NeighborsDistance().View( + {queries.GetShape(0), batch_knn}); + return; + } + + NeighborSearchAllocator output_allocator(device); + int64_t neighbors_size = 0; + for (const auto& allocator : batch_output_allocators) { + neighbors_size += allocator.NeighborsIndex().GetShape(0); + } + + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, neighbors_size); + output_allocator.AllocDistances(&neighbors_distance_ptr, neighbors_size); + + int64_t offset = 0; + for (const auto& allocator : 
batch_output_allocators) { + const int64_t batch_size_i = allocator.NeighborsIndex().GetShape(0); + if (batch_size_i == 0) { + continue; + } + MemoryManager::Memcpy(neighbors_index_ptr + offset, device, + allocator.IndicesPtr(), device, + sizeof(TIndex) * batch_size_i); + MemoryManager::Memcpy(neighbors_distance_ptr + offset, device, + allocator.DistancesPtr(), device, + sizeof(T) * batch_size_i); + offset += batch_size_i; + } + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL fixed radius search only supports L2 metric."); + } + if (ignore_query_point) { + utility::LogError( + "SYCL fixed radius search does not support ignore_query_point."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const T threshold = static_cast(radius * radius); + std::vector counts(num_queries, 0); + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + + for (int64_t q = query_begin; q < query_end; ++q) { + const Tensor query = queries.Slice(0, q, q + 1).Flatten(); + counts[q] = CountQueryResultsWithinRadiusSYCL( + points_i, 
query, threshold); + } + } + + std::vector row_splits(num_queries + 1, 0); + for (int64_t q = 0; q < num_queries; ++q) { + row_splits[q + 1] = row_splits[q] + counts[q]; + } + neighbors_row_splits = + Tensor(row_splits, {num_queries + 1}, Int64).To(device); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, row_splits.back()); + if (return_distances) { + output_allocator.AllocDistances(&neighbors_distance_ptr, + row_splits.back()); + } else { + output_allocator.AllocDistances(&neighbors_distance_ptr, 0); + } + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + + for (int64_t q = query_begin; q < query_end; ++q) { + if (counts[q] == 0) { + continue; + } + const Tensor query = queries.Slice(0, q, q + 1).Flatten(); + GatherQueryResultsWithinRadiusSYCL( + points_i, query, counts[q], TIndex(point_begin), + neighbors_index_ptr + row_splits[q], + return_distances ? 
neighbors_distance_ptr + row_splits[q] + : nullptr); + } + } + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL hybrid search only supports L2 metric."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const T threshold = static_cast(radius * radius); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, + num_queries * max_knn, TIndex(-1)); + output_allocator.AllocDistances(&neighbors_distance_ptr, + num_queries * max_knn, T(0)); + + std::vector counts(num_queries, 0); + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = points_row_splits[batch_idx].Item(); + const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + + for (int64_t q = query_begin; q < query_end; ++q) { + const Tensor query = queries.Slice(0, q, q + 1).Flatten(); + const int64_t num_neighbors = + CountQueryResultsWithinRadiusSYCL( + points_i, query, threshold); + counts[q] = static_cast(std::min(num_neighbors, + max_knn)); + if (counts[q] == 0) { + continue; + } + GatherQueryResultsWithinRadiusSYCL( + points_i, query, counts[q], TIndex(point_begin), + neighbors_index_ptr + q * max_knn, + 
neighbors_distance_ptr + q * max_knn); + } + } + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); + neighbors_count = + Tensor(counts, {num_queries}, Dtype::FromType()).To(device); +} + +#define INSTANTIATE(T, TIndex) \ + template void KnnSearchSYCL( \ + const Tensor& points, const Tensor& points_row_splits, \ + const Tensor& queries, const Tensor& queries_row_splits, int knn, \ + Tensor& neighbors_index, Tensor& neighbors_row_splits, \ + Tensor& neighbors_distance); \ + template void FixedRadiusSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + const Tensor& points_row_splits, const Tensor& queries_row_splits, \ + const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ + const bool ignore_query_point, const bool return_distances, \ + const bool sort, Tensor& neighbors_index, \ + Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ + template void HybridSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + int max_knn, const Tensor& points_row_splits, \ + const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ + const Tensor&, const Metric metric, Tensor& neighbors_index, \ + Tensor& neighbors_count, Tensor& neighbors_distance); + +INSTANTIATE(float, int32_t) +INSTANTIATE(float, int64_t) +INSTANTIATE(double, int32_t) +INSTANTIATE(double, int64_t) + +} // namespace nns +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.cpp b/cpp/open3d/core/nns/NearestNeighborSearch.cpp index 9cc8030a35b..920971beaed 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.cpp +++ b/cpp/open3d/core/nns/NearestNeighborSearch.cpp @@ -21,21 +21,18 @@ bool NearestNeighborSearch::SetIndex() { }; bool NearestNeighborSearch::KnnIndex() { - if (dataset_points_.IsCUDA()) { -#ifdef BUILD_CUDA_MODULE + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { knn_index_.reset(new nns::KnnIndex()); 
return knn_index_->SetTensorData(dataset_points_, index_dtype_); -#else - utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please recompile Open3D with " - "-DBUILD_CUDA_MODULE=ON."); -#endif } else { return SetIndex(); } }; -bool NearestNeighborSearch::MultiRadiusIndex() { return SetIndex(); }; +bool NearestNeighborSearch::MultiRadiusIndex() { + AssertCPU(dataset_points_); + return SetIndex(); +}; bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { if (dataset_points_.IsCUDA()) { @@ -53,6 +50,15 @@ bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError("radius is required for SYCL FixedRadiusIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData(dataset_points_, + radius.value(), + index_dtype_); + } return SetIndex(); } } @@ -72,6 +78,15 @@ bool NearestNeighborSearch::HybridIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError("radius is required for SYCL HybridIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData(dataset_points_, + radius.value(), + index_dtype_); + } return SetIndex(); } }; @@ -80,7 +95,7 @@ std::pair NearestNeighborSearch::KnnSearch( const Tensor& query_points, int knn) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (knn_index_) { return knn_index_->SearchKnn(query_points, knn); } else { @@ -99,7 +114,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( const Tensor& query_points, double radius, bool sort) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return 
fixed_radius_index_->SearchRadius(query_points, radius, sort); @@ -117,7 +132,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( std::tuple NearestNeighborSearch::MultiRadiusSearch( const Tensor& query_points, const Tensor& radii) { - AssertNotCUDA(query_points); + AssertCPU(query_points); AssertTensorDtype(query_points, dataset_points_.GetDtype()); AssertTensorDtype(radii, dataset_points_.GetDtype()); @@ -133,7 +148,7 @@ std::tuple NearestNeighborSearch::HybridSearch( const int max_knn) const { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return fixed_radius_index_->SearchHybrid(query_points, radius, max_knn); @@ -150,11 +165,11 @@ std::tuple NearestNeighborSearch::HybridSearch( } } -void NearestNeighborSearch::AssertNotCUDA(const Tensor& t) const { - if (t.IsCUDA()) { +void NearestNeighborSearch::AssertCPU(const Tensor& t) const { + if (!t.IsCPU()) { utility::LogError( - "TODO: NearestNeighborSearch does not support CUDA tensor " - "yet."); + "NearestNeighborSearch only supports CPU tensors for this " + "operation."); } } diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.h b/cpp/open3d/core/nns/NearestNeighborSearch.h index b9fe14ad1ed..aa22ca352b5 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.h +++ b/cpp/open3d/core/nns/NearestNeighborSearch.h @@ -27,14 +27,12 @@ class NearestNeighborSearch { /// Constructor. /// /// \param dataset_points Dataset points for constructing search index. Must - /// be 2D, with shape {n, d}. SYCL tensors are not yet supported. + /// be 2D, with shape {n, d}. 
// NearestNeighborSearch(const Tensor &dataset_points) // : dataset_points_(dataset_points){}; NearestNeighborSearch(const Tensor &dataset_points, const Dtype &index_dtype = core::Int32) - : dataset_points_(dataset_points), index_dtype_(index_dtype) { - AssertNotSYCL(dataset_points_); - }; + : dataset_points_(dataset_points), index_dtype_(index_dtype) {} ~NearestNeighborSearch(); NearestNeighborSearch(const NearestNeighborSearch &) = delete; NearestNeighborSearch &operator=(const NearestNeighborSearch &) = delete; @@ -47,17 +45,22 @@ class NearestNeighborSearch { /// Set index for multi-radius search. /// + /// Multi-radius search is currently only supported for CPU tensors. + /// /// \return Returns true if building index success, otherwise false. bool MultiRadiusIndex(); /// Set index for fixed-radius search. /// - /// \param radius optional radius parameter. required for gpu fixed radius - /// index. \return Returns true if building index success, otherwise false. + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// fixed radius indices. + /// \return Returns true if building index success, otherwise false. bool FixedRadiusIndex(std::optional radius = {}); /// Set index for hybrid search. /// + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// hybrid indices. /// \return Returns true if building index success, otherwise false. bool HybridIndex(std::optional radius = {}); @@ -88,6 +91,7 @@ class NearestNeighborSearch { const Tensor &query_points, double radius, bool sort = true); /// Perform multi-radius search. Each query point has an independent radius. + /// Only CPU tensors are currently supported for this search mode. /// /// \param query_points Query points. Must be 2D, with shape {n, d}. /// \param radii Radii of query points. Each query point has one radius. @@ -121,8 +125,8 @@ class NearestNeighborSearch { private: bool SetIndex(); - /// Assert a Tensor is not CUDA tensoer. 
This will be removed in the future. - void AssertNotCUDA(const Tensor &t) const; + /// Assert a Tensor is on CPU. + void AssertCPU(const Tensor &t) const; protected: std::unique_ptr nanoflann_index_; diff --git a/cpp/pybind/core/nns/nearest_neighbor_search.cpp b/cpp/pybind/core/nns/nearest_neighbor_search.cpp index 3bb647592a0..99394d8681a 100644 --- a/cpp/pybind/core/nns/nearest_neighbor_search.cpp +++ b/cpp/pybind/core/nns/nearest_neighbor_search.cpp @@ -84,7 +84,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for fixed-radius search. Required - for GPU fixed radius index. + for CUDA and SYCL fixed radius indices. Returns: True on success. @@ -115,7 +115,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for hybrid search. Required - for GPU hybrid index. + for CUDA and SYCL hybrid indices. Returns: True on success. @@ -130,7 +130,8 @@ This function needs to be called once before performing search operations. To use knn_search initialize the index using knn_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. knn (int): Number of neighbors to search per query point. Example: @@ -179,6 +180,7 @@ This function needs to be called once before performing search operations. Args: query_points (open3d.core.Tensor): Query points with shape {n, d}. + CPU, CUDA, and SYCL devices are supported. radius (float): Radius value for fixed-radius search. Note that this parameter can differ from the radius used to initialize the index for convenience, which may cause the index to be rebuilt for GPU @@ -234,7 +236,8 @@ This function needs to be called once before performing search operations. 
To use multi_radius_search initialize the index using multi_radius_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. Only + CPU tensors are supported. radii (open3d.core.Tensor): Radii of query points. Each query point has one radius. Returns: @@ -292,7 +295,8 @@ Hybrid search behaves similarly to fixed-radius search, but with a maximum numbe To use hybrid_search initialize the index using hybrid_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. radius (float): Radius value for hybrid search. max_knn (int): Maximum number of neighbor to search per query. diff --git a/cpp/tests/core/NearestNeighborSearch.cpp b/cpp/tests/core/NearestNeighborSearch.cpp index d282e8ecd9a..547d3829a06 100644 --- a/cpp/tests/core/NearestNeighborSearch.cpp +++ b/cpp/tests/core/NearestNeighborSearch.cpp @@ -22,7 +22,7 @@ namespace open3d { namespace tests { -class NNSPermuteDevices : public PermuteDevices {}; +class NNSPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(NearestNeighborSearch, NNSPermuteDevices, testing::ValuesIn(PermuteDevices::TestCases())); diff --git a/python/test/core/test_nn.py b/python/test/core/test_nn.py index 48b7d2682cd..b5376fec105 100644 --- a/python/test/core/test_nn.py +++ b/python/test/core/test_nn.py @@ -19,7 +19,7 @@ np.random.seed(0) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_index(device): dtype = o3c.float32 @@ -34,7 +34,7 @@ def test_knn_index(device): assert nns.multi_radius_index() -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_search(device): dtype = 
o3c.float32 @@ -107,7 +107,7 @@ def test_knn_search(device): rtol=1e-5) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_fixed_radius_search(device, dtype): dataset_points = o3c.Tensor( From 26e069761be69d3255ca51bb262c5b64e964fc06 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 07:32:25 +0000 Subject: [PATCH 11/14] Polish SYCL nearest-neighbor messages Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/core/nns/FixedRadiusIndex.cpp | 6 +++--- cpp/open3d/core/nns/KnnIndex.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.cpp b/cpp/open3d/core/nns/FixedRadiusIndex.cpp index b1254c396e1..a7316b1c8ff 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.cpp +++ b/cpp/open3d/core/nns/FixedRadiusIndex.cpp @@ -104,7 +104,7 @@ bool FixedRadiusIndex::SetTensorData(const Tensor &dataset_points, CALL_BUILD(double, BuildSpatialHashTableCUDA) #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { @@ -179,7 +179,7 @@ std::tuple FixedRadiusIndex::SearchRadius( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { @@ -258,7 +258,7 @@ std::tuple FixedRadiusIndex::SearchHybrid( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. 
Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { diff --git a/cpp/open3d/core/nns/KnnIndex.cpp b/cpp/open3d/core/nns/KnnIndex.cpp index 596ad92d69d..a2c24233a2e 100644 --- a/cpp/open3d/core/nns/KnnIndex.cpp +++ b/cpp/open3d/core/nns/KnnIndex.cpp @@ -135,7 +135,7 @@ std::pair KnnIndex::SearchKnn(const Tensor& query_points, }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); #endif } else if (device.IsSYCL()) { From e10955f817908d7606f8a4a360f324fb1435c854 Mon Sep 17 00:00:00 2001 From: Sameer Sheorey Date: Sun, 15 Mar 2026 23:16:09 -0700 Subject: [PATCH 12/14] Optimized implementation with tiles. --- cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp | 901 +++++++++++++++++------ cpp/tests/core/CMakeLists.txt | 19 +- cpp/tests/core/KnnIndex.cpp | 34 +- 3 files changed, 698 insertions(+), 256 deletions(-) diff --git a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp index 5bb4f2d02a7..2d20e4b86fa 100644 --- a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp +++ b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp @@ -6,14 +6,14 @@ // ---------------------------------------------------------------------------- #include +#include #include #include #include -#include -#include "open3d/core/MemoryManager.h" #include "open3d/core/SYCLContext.h" #include "open3d/core/Tensor.h" +#include "open3d/core/linalg/AddMM.h" #include "open3d/core/nns/FixedRadiusIndex.h" #include "open3d/core/nns/KnnIndex.h" #include "open3d/core/nns/NeighborSearchAllocator.h" @@ -24,8 +24,27 @@ namespace open3d { namespace core { namespace nns { +// SYCL nearest-neighbor search uses tiled distance evaluation with AddMM, +// batched threshold/count helpers, and query-local top-k selection/merge. namespace { +// Chooses a query/point tile shape that limits temporary distance storage. 
+inline void ChooseTileSizeSYCL(int64_t num_queries, + int64_t num_points, + int64_t element_size, + int64_t& tile_queries, + int64_t& tile_points) { + constexpr int64_t kTargetTileBytes = 8 * 1024 * 1024; + tile_queries = std::min(num_queries, 128); + tile_queries = std::max(tile_queries, 1); + tile_points = std::max( + kTargetTileBytes / (tile_queries * element_size), int64_t(256)); + tile_points = std::min(tile_points, num_points); + tile_points = std::max(tile_points, 1); +} + +// Computes squared L2 distances from one query to all points with a SYCL +// kernel. This path is still used by the small single-row helpers below. template void ComputeSquaredDistancesSYCL(const Device& device, const T* points_ptr, @@ -40,199 +59,323 @@ void ComputeSquaredDistancesSYCL(const Device& device, constexpr int64_t kWorkGroupSize = 128; const int64_t global_size = utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); queue.submit([&](sycl::handler& cgh) { - sycl::local_accessor local_query( - sycl::range<1>(dimension), cgh); - cgh.parallel_for( - sycl::nd_range<1>(sycl::range<1>(global_size), - sycl::range<1>(kWorkGroupSize)), - [=](sycl::nd_item<1> item) { - const int64_t lid = item.get_local_linear_id(); - for (int64_t d = lid; d < dimension; - d += item.get_local_range(0)) { - local_query[d] = query_ptr[d]; - } - item.barrier(sycl::access::fence_space::local_space); - - const int64_t point_idx = item.get_global_linear_id(); - if (point_idx >= num_points) { - return; - } - - const T* point = points_ptr + point_idx * dimension; - T dist = 0; - for (int64_t d = 0; d < dimension; ++d) { - const T diff = point[d] - local_query[d]; - dist += diff * diff; - } - distances_ptr[point_idx] = dist; - }); - }).wait_and_throw(); + sycl::local_accessor local_query(sycl::range<1>(dimension), cgh); + cgh.parallel_for( + 
sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t lid = item.get_local_linear_id(); + for (int64_t d = lid; d < dimension; + d += item.get_local_range(0)) { + local_query[d] = query_ptr[d]; + } + item.barrier(sycl::access::fence_space::local_space); + + const int64_t point_idx = item.get_global_linear_id(); + if (point_idx >= num_points) { + return; + } + + const T* point = points_ptr + point_idx * dimension; + T dist = 0; + for (int64_t d = 0; d < dimension; ++d) { + const T diff = point[d] - local_query[d]; + dist += diff * diff; + } + distances_ptr[point_idx] = dist; + }); + }); } -template -void SortIndicesByDistanceSYCL(const Device& device, - const T* distances_ptr, - TIndex* indices_ptr, - int64_t num_points) { - if (num_points <= 1) { +// Counts how many entries per query are within a threshold using one launch per +// tile, avoiding per-query device algorithm dispatch. +template +void CountWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + int64_t* counts_ptr) { + if (num_queries == 0 || num_points == 0) { return; } - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); - auto policy = oneapi::dpl::execution::make_device_policy(queue); - std::sort(policy, indices_ptr, indices_ptr + num_points, - [distances_ptr](TIndex lhs, TIndex rhs) { - const T lhs_dist = distances_ptr[lhs]; - const T rhs_dist = distances_ptr[rhs]; - if (lhs_dist < rhs_dist) { - return true; - } - if (rhs_dist < lhs_dist) { - return false; - } - return lhs < rhs; - }); - queue.wait_and_throw(); + + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + int64_t local_count = 0; + const T* query_distances = + distances_ptr + query_idx * 
distance_query_stride; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + if (query_distances[point_idx] <= threshold) { + ++local_count; + } + } + counts_ptr[query_idx] += local_count; + }); } +// Appends per-query threshold matches into the final neighbor buffers using +// caller-provided query offsets. template -int64_t CountWithinRadiusSYCL(const Device& device, - const T* distances_ptr, - const TIndex* sorted_indices_ptr, - int64_t num_points, - T threshold) { - if (num_points == 0) { - return 0; +void GatherWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + TIndex index_offset, + int64_t* offsets_ptr, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + if (num_queries == 0 || num_points == 0) { + return; } - constexpr int64_t kWorkGroupSize = 128; - const int64_t global_size = - utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; - Tensor count = Tensor::Zeros({1}, Int64, device); - int64_t* count_ptr = count.GetDataPtr(); - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); - - queue.submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<1>(sycl::range<1>(global_size), - sycl::range<1>(kWorkGroupSize)), - [=](sycl::nd_item<1> item) { - const int64_t idx = item.get_global_linear_id(); - int local_count = 0; - if (idx < num_points) { - local_count = - distances_ptr[sorted_indices_ptr[idx]] <= - threshold - ? 
1 - : 0; - } - auto group = item.get_group(); - const int group_count = sycl::reduce_over_group( - group, local_count, sycl::plus()); - if (item.get_local_linear_id() == 0 && - group_count > 0) { - sycl::atomic_ref - count_ref(*count_ptr); - count_ref += group_count; - } - }); - }).wait_and_throw(); - - return count.To(Device("CPU:0")).Item(); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + int64_t offset = offsets_ptr[query_idx]; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + const T dist = query_distances[point_idx]; + if (dist <= threshold) { + out_indices_ptr[offset] = index_offset + point_idx; + if (out_distances_ptr != nullptr) { + out_distances_ptr[offset] = dist; + } + ++offset; + } + } + offsets_ptr[query_idx] = offset; + }); } +// Selects the smallest k distances per query with OneDPL partial_sort while +// reusing scratch index buffers allocated by the caller. 
template -void GatherResultsSYCL(const Device& device, - const T* distances_ptr, - const TIndex* sorted_indices_ptr, - int64_t num_results, - TIndex index_offset, - TIndex* out_indices_ptr, - T* out_distances_ptr) { - if (num_results == 0) { +void SelectTopKQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + int64_t knn, + TIndex index_offset, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride, + bool use_threshold = false, + T threshold = T(0)) { + if (num_queries == 0 || num_points == 0 || knn <= 0) { return; } - sycl::queue queue = - sy::SYCLContext::GetInstance().GetDefaultQueue(device); - queue.parallel_for(sycl::range<1>(num_results), [=](sycl::id<1> id) { - const int64_t i = id[0]; - const TIndex point_idx = sorted_indices_ptr[i]; - out_indices_ptr[i] = point_idx + index_offset; - if (out_distances_ptr != nullptr) { - out_distances_ptr[i] = distances_ptr[point_idx]; - } - }).wait_and_throw(); -} + const T inf = std::numeric_limits::max(); + const int64_t actual_knn = std::min(knn, num_points); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); -template -void ProcessQuerySYCL(const Tensor& points, - const Tensor& query, - int64_t num_results, - TIndex index_offset, - TIndex* out_indices_ptr, - T* out_distances_ptr) { - const Device device = points.GetDevice(); - const int64_t num_points = points.GetShape(0); - - Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); - Tensor sorted_indices = Tensor::Arange( - 0, num_points, 1, Dtype::FromType(), device); - - ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, - points.GetShape(1), query.GetDataPtr(), - distances.GetDataPtr()); - SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), - 
sorted_indices.GetDataPtr(), - num_points); - GatherResultsSYCL(device, distances.GetDataPtr(), - sorted_indices.GetDataPtr(), - num_results, index_offset, out_indices_ptr, - out_distances_ptr); + queue.parallel_for( + sycl::range<2>(num_queries, num_points), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + std::partial_sort(policy, query_indices, query_indices + actual_knn, + query_indices + num_points, + [query_distances](TIndex lhs, TIndex rhs) { + const T lhs_dist = query_distances[lhs]; + const T rhs_dist = query_distances[rhs]; + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs < rhs; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + + if (k >= actual_knn) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + + const TIndex local_idx = + scratch_indices_ptr[query_idx * scratch_query_stride + k]; + const T dist = + distances_ptr[query_idx * distance_query_stride + local_idx]; + if (use_threshold && dist > threshold) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + query_out_indices[k] = index_offset + local_idx; + query_out_distances[k] = dist; + }); } +// Merges two sorted top-k query buffers into a new top-k query buffer with +// OneDPL partial_sort over 2k candidates. 
template -int64_t CountQueryResultsWithinRadiusSYCL(const Tensor& points, - const Tensor& query, - T threshold) { - const Device device = points.GetDevice(); - const int64_t num_points = points.GetShape(0); - - Tensor distances = Tensor::Empty({num_points}, points.GetDtype(), device); - Tensor sorted_indices = Tensor::Arange( - 0, num_points, 1, Dtype::FromType(), device); - - ComputeSquaredDistancesSYCL(device, points.GetDataPtr(), num_points, - points.GetShape(1), query.GetDataPtr(), - distances.GetDataPtr()); - SortIndicesByDistanceSYCL(device, distances.GetDataPtr(), - sorted_indices.GetDataPtr(), - num_points); - return CountWithinRadiusSYCL( - device, distances.GetDataPtr(), sorted_indices.GetDataPtr(), - num_points, threshold); +void MergeTopKQueriesSYCL(const Device& device, + const T* current_distances_ptr, + const TIndex* current_indices_ptr, + int64_t current_query_stride, + const T* candidate_distances_ptr, + const TIndex* candidate_indices_ptr, + int64_t candidate_query_stride, + int64_t num_queries, + int64_t knn, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride) { + if (num_queries == 0 || knn <= 0) { + return; + } + + const T inf = std::numeric_limits::max(); + const int64_t combined_cols = 2 * knn; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + queue.parallel_for( + sycl::range<2>(num_queries, combined_cols), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_current_dist = + current_distances_ptr + query_idx * current_query_stride; + const TIndex* query_current_idx = + current_indices_ptr + query_idx * current_query_stride; + 
const T* query_candidate_dist = + candidate_distances_ptr + query_idx * candidate_query_stride; + const TIndex* query_candidate_idx = + candidate_indices_ptr + query_idx * candidate_query_stride; + + std::partial_sort( + policy, query_indices, query_indices + knn, + query_indices + combined_cols, + [query_current_dist, query_current_idx, query_candidate_dist, + query_candidate_idx, knn](TIndex lhs, TIndex rhs) { + const bool lhs_is_current = lhs < knn; + const bool rhs_is_current = rhs < knn; + const TIndex lhs_idx = + lhs_is_current ? query_current_idx[lhs] + : query_candidate_idx[lhs - knn]; + const TIndex rhs_idx = + rhs_is_current ? query_current_idx[rhs] + : query_candidate_idx[rhs - knn]; + const T lhs_dist = + lhs_is_current ? query_current_dist[lhs] + : query_candidate_dist[lhs - knn]; + const T rhs_dist = + rhs_is_current ? query_current_dist[rhs] + : query_candidate_dist[rhs - knn]; + const bool lhs_valid = lhs_idx >= 0; + const bool rhs_valid = rhs_idx >= 0; + if (lhs_valid != rhs_valid) { + return lhs_valid; + } + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs_idx < rhs_idx; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + const TIndex source = + scratch_indices_ptr[query_idx * scratch_query_stride + k]; + const bool is_current = source < knn; + const int64_t offset = is_current ? source : source - knn; + const TIndex idx = + is_current + ? current_indices_ptr[query_idx * current_query_stride + + offset] + : candidate_indices_ptr[query_idx * + candidate_query_stride + + offset]; + const T dist = + is_current + ? 
current_distances_ptr[query_idx * + current_query_stride + + offset] + : candidate_distances_ptr + [query_idx * candidate_query_stride + offset]; + if (idx < 0) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + } else { + query_out_indices[k] = idx; + query_out_distances[k] = dist; + } + }); } +// Finalizes hybrid-search counts and clears any unused tail elements in the +// fixed-width output tensors. template -void GatherQueryResultsWithinRadiusSYCL(const Tensor& points, - const Tensor& query, - int64_t num_results, - TIndex index_offset, - TIndex* out_indices_ptr, - T* out_distances_ptr) { - ProcessQuerySYCL(points, query, num_results, index_offset, - out_indices_ptr, out_distances_ptr); +void FinalizeHybridResultsSYCL(const Device& device, + const int64_t* counts_ptr, + int64_t num_queries, + int64_t max_knn, + TIndex* neighbors_index_ptr, + T* neighbors_distance_ptr, + TIndex* neighbors_count_ptr) { + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const int64_t clipped_count = + std::min(counts_ptr[query_idx], max_knn); + neighbors_count_ptr[query_idx] = static_cast(clipped_count); + for (int64_t k = clipped_count; k < max_knn; ++k) { + neighbors_index_ptr[query_idx * max_knn + k] = TIndex(-1); + neighbors_distance_ptr[query_idx * max_knn + k] = T(0); + } + }); } } // namespace +// Batched KNN search with tiled AddMM distance evaluation and query-local top-k +// selection/merge on each tile. 
template void KnnSearchSYCL(const Tensor& points, const Tensor& points_row_splits, @@ -243,6 +386,9 @@ void KnnSearchSYCL(const Tensor& points, Tensor& neighbors_row_splits, Tensor& neighbors_distance) { const Device device = points.GetDevice(); + const Dtype dtype = points.GetDtype(); + const Dtype index_dtype = Dtype::FromType(); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); const int batch_size = points_row_splits.GetShape(0) - 1; std::vector> batch_output_allocators( batch_size, NeighborSearchAllocator(device)); @@ -253,8 +399,10 @@ void KnnSearchSYCL(const Tensor& points, int64_t batch_knn = 0; for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = @@ -280,20 +428,119 @@ void KnnSearchSYCL(const Tensor& points, batch_output_allocators[batch_idx].AllocDistances( &distances_ptr, num_queries_i * batch_knn, T(0)); - for (int64_t q = 0; q < num_queries_i; ++q) { - const Tensor query = queries_i.Slice(0, q, q + 1).Flatten(); - ProcessQuerySYCL( - points_i, query, batch_knn, TIndex(point_begin), - indices_ptr + q * batch_knn, distances_ptr + q * batch_knn); + Tensor out_indices = + batch_output_allocators[batch_idx].NeighborsIndex().View( + {num_queries_i, batch_knn}); + Tensor out_distances = + batch_output_allocators[batch_idx].NeighborsDistance().View( + {num_queries_i, batch_knn}); + + // Precompute norms for batched distance evaluation. 
+ Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries_i, num_points_i, sizeof(T), tile_queries, + tile_points); + + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = + Tensor::Empty({tile_queries, tile_points}, index_dtype, device); + Tensor tile_top_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor best_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor best_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merged_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor merged_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty({tile_queries, 2 * batch_knn}, + index_dtype, device); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + + Tensor best_indices_view = + best_indices.Slice(0, 0, num_queries_iter); + Tensor best_distances_view = + best_distances.Slice(0, 0, num_queries_iter); + best_indices_view.Fill(TIndex(-1)); + best_distances_view.Fill(std::numeric_limits::max()); + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); 
+ + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, batch_knn, TIndex(p), + tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), batch_knn); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), batch_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), batch_knn, + num_queries_iter, batch_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), batch_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; + } + + out_indices.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_indices_view; + out_distances.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_distances_view; } + + queue.wait_and_throw(); } if (batch_size == 1) { - neighbors_index = - batch_output_allocators[0].NeighborsIndex().View({queries.GetShape(0), - batch_knn}); - neighbors_distance = batch_output_allocators[0].NeighborsDistance().View( + neighbors_index = batch_output_allocators[0].NeighborsIndex().View( {queries.GetShape(0), batch_knn}); + neighbors_distance = + batch_output_allocators[0].NeighborsDistance().View( + 
{queries.GetShape(0), batch_knn}); return; } @@ -326,6 +573,8 @@ void KnnSearchSYCL(const Tensor& points, neighbors_distance = output_allocator.NeighborsDistance(); } +// Fixed-radius search using two tiled passes: one to count matches and one to +// gather indices and optional distances. template void FixedRadiusSearchSYCL(const Tensor& points, const Tensor& queries, @@ -347,38 +596,88 @@ void FixedRadiusSearchSYCL(const Tensor& points, } if (ignore_query_point) { utility::LogError( - "SYCL fixed radius search does not support ignore_query_point."); + "SYCL fixed radius search does not support " + "ignore_query_point."); } const Device device = points.GetDevice(); const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); const T threshold = static_cast(radius * radius); - std::vector counts(num_queries, 0); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = queries_row_splits[batch_idx + 1].Item(); const Tensor points_i = points.Slice(0, point_begin, point_end); - - for (int64_t q = query_begin; q < query_end; ++q) { - const Tensor query = queries.Slice(0, q, q + 1).Flatten(); - counts[q] = CountQueryResultsWithinRadiusSYCL( - points_i, query, threshold); + const 
Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + } } } + queue.wait_and_throw(); + + Tensor counts_cpu = counts.To(Device("CPU:0")); + const int64_t* counts_cpu_ptr = counts_cpu.GetDataPtr(); std::vector row_splits(num_queries + 1, 0); for (int64_t q = 0; q < num_queries; ++q) { - row_splits[q + 1] = row_splits[q] + counts[q]; + row_splits[q + 1] = row_splits[q] + counts_cpu_ptr[q]; } neighbors_row_splits = Tensor(row_splits, {num_queries + 1}, Int64).To(device); + std::vector row_offsets(row_splits.begin(), row_splits.end() - 1); + Tensor write_offsets = Tensor(row_offsets, {num_queries}, 
Int64).To(device); + NeighborSearchAllocator output_allocator(device); TIndex* neighbors_index_ptr; T* neighbors_distance_ptr; @@ -392,31 +691,64 @@ void FixedRadiusSearchSYCL(const Tensor& points, for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = queries_row_splits[batch_idx + 1].Item(); const Tensor points_i = points.Slice(0, point_begin, point_end); - - for (int64_t q = query_begin; q < query_end; ++q) { - if (counts[q] == 0) { - continue; + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* offsets_ptr = + write_offsets.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + 
temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + GatherWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, TIndex(point_begin + p), + offsets_ptr, neighbors_index_ptr, + return_distances ? neighbors_distance_ptr : nullptr); } - const Tensor query = queries.Slice(0, q, q + 1).Flatten(); - GatherQueryResultsWithinRadiusSYCL( - points_i, query, counts[q], TIndex(point_begin), - neighbors_index_ptr + row_splits[q], - return_distances ? neighbors_distance_ptr + row_splits[q] - : nullptr); } } + queue.wait_and_throw(); + neighbors_index = output_allocator.NeighborsIndex(); neighbors_distance = output_allocator.NeighborsDistance(); } +// Hybrid search counts all neighbors within radius while keeping only the best +// max_knn results per query in fixed-size output tensors. template void HybridSearchSYCL(const Tensor& points, const Tensor& queries, @@ -437,68 +769,161 @@ void HybridSearchSYCL(const Tensor& points, const Device device = points.GetDevice(); const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); const T threshold = static_cast(radius * radius); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); NeighborSearchAllocator output_allocator(device); TIndex* neighbors_index_ptr; T* neighbors_distance_ptr; - output_allocator.AllocIndices(&neighbors_index_ptr, - num_queries * max_knn, TIndex(-1)); + output_allocator.AllocIndices(&neighbors_index_ptr, num_queries * max_knn, + TIndex(-1)); output_allocator.AllocDistances(&neighbors_distance_ptr, - num_queries * max_knn, T(0)); + num_queries * max_knn, + std::numeric_limits::max()); + + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + Tensor out_indices = + output_allocator.NeighborsIndex().View({num_queries, max_knn}); + Tensor out_distances 
= + output_allocator.NeighborsDistance().View({num_queries, max_knn}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = Tensor::Empty({tile_queries, tile_points}, + Dtype::FromType(), device); + Tensor tile_top_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merged_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor merged_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty( + {tile_queries, 2 * max_knn}, Dtype::FromType(), device); - std::vector counts(num_queries, 0); for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; ++batch_idx) { - const int64_t point_begin = points_row_splits[batch_idx].Item(); - const int64_t point_end = points_row_splits[batch_idx + 1].Item(); + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); const int64_t query_begin = queries_row_splits[batch_idx].Item(); const int64_t query_end = queries_row_splits[batch_idx + 1].Item(); const Tensor points_i = points.Slice(0, point_begin, point_end); - - for (int64_t q = query_begin; q < query_end; ++q) { - const Tensor query = queries.Slice(0, q, q + 1).Flatten(); - const int64_t num_neighbors = - CountQueryResultsWithinRadiusSYCL( - points_i, query, threshold); - counts[q] = static_cast(std::min(num_neighbors, - max_knn)); - if (counts[q] == 0) { - continue; + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = 
points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + Tensor best_indices_view = out_indices.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + Tensor best_distances_view = out_distances.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, max_knn, TIndex(point_begin + p), + 
tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), max_knn, true, + threshold); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), max_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), max_knn, + num_queries_iter, max_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), max_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; } - GatherQueryResultsWithinRadiusSYCL( - points_i, query, counts[q], TIndex(point_begin), - neighbors_index_ptr + q * max_knn, - neighbors_distance_ptr + q * max_knn); } } neighbors_index = output_allocator.NeighborsIndex(); neighbors_distance = output_allocator.NeighborsDistance(); neighbors_count = - Tensor(counts, {num_queries}, Dtype::FromType()).To(device); + Tensor::Empty({num_queries}, Dtype::FromType(), device); + FinalizeHybridResultsSYCL(device, counts.GetDataPtr(), + num_queries, max_knn, + neighbors_index.GetDataPtr(), + neighbors_distance.GetDataPtr(), + neighbors_count.GetDataPtr()); + queue.wait_and_throw(); } -#define INSTANTIATE(T, TIndex) \ - template void KnnSearchSYCL( \ - const Tensor& points, const Tensor& points_row_splits, \ - const Tensor& queries, const Tensor& queries_row_splits, int knn, \ - Tensor& neighbors_index, Tensor& neighbors_row_splits, \ - Tensor& neighbors_distance); \ - template void FixedRadiusSearchSYCL( \ - const Tensor& points, const Tensor& queries, double radius, \ +#define INSTANTIATE(T, TIndex) \ + template void KnnSearchSYCL( \ + const Tensor& points, const Tensor& points_row_splits, \ + const Tensor& queries, const Tensor& queries_row_splits, int knn, \ + Tensor& neighbors_index, Tensor& neighbors_row_splits, \ + Tensor& neighbors_distance); \ + template void 
FixedRadiusSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ const Tensor& points_row_splits, const Tensor& queries_row_splits, \ - const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ - const bool ignore_query_point, const bool return_distances, \ - const bool sort, Tensor& neighbors_index, \ - Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ - template void HybridSearchSYCL( \ - const Tensor& points, const Tensor& queries, double radius, \ - int max_knn, const Tensor& points_row_splits, \ - const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ - const Tensor&, const Metric metric, Tensor& neighbors_index, \ + const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ + const bool ignore_query_point, const bool return_distances, \ + const bool sort, Tensor& neighbors_index, \ + Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ + template void HybridSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + int max_knn, const Tensor& points_row_splits, \ + const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ + const Tensor&, const Metric metric, Tensor& neighbors_index, \ Tensor& neighbors_count, Tensor& neighbors_distance); INSTANTIATE(float, int32_t) diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index a1c4826d0fa..1d71b53455c 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -22,10 +22,15 @@ target_sources(tests PRIVATE SYCLUtils.cpp ) +if (BUILD_CUDA_MODULE OR BUILD_SYCL_MODULE) + target_sources(tests PRIVATE + KnnIndex.cpp + ) +endif() + if (BUILD_CUDA_MODULE) target_sources(tests PRIVATE FixedRadiusIndex.cpp - KnnIndex.cpp ParallelFor.cu ) endif() @@ -38,10 +43,8 @@ if (BUILD_ISPC_MODULE) endif() # TODO: cmake does not currently build this test! 
-# if (BUILD_SYCL_MODULE) -# target_sources(tests PRIVATE -# ParallelForSYCL.cpp -# ) -# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES -# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") -# endif() \ No newline at end of file +#if (BUILD_SYCL_MODULE) +# open3d_sycl_target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +#endif() \ No newline at end of file diff --git a/cpp/tests/core/KnnIndex.cpp b/cpp/tests/core/KnnIndex.cpp index 1ceb32d4835..fdeb8aa9c52 100644 --- a/cpp/tests/core/KnnIndex.cpp +++ b/cpp/tests/core/KnnIndex.cpp @@ -24,9 +24,17 @@ using namespace std; namespace open3d { namespace tests { -TEST(KnnIndex, KnnSearch) { +class KnnIndexPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P(KnnIndex, + KnnIndexPermuteDevices, + testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); + +TEST_P(KnnIndexPermuteDevices, KnnSearch) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -149,9 +157,12 @@ TEST(KnnIndex, KnnSearch) { EXPECT_TRUE(distances.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchHighdim) { +TEST_P(KnnIndexPermuteDevices, KnnSearchHighdim) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -231,9 +242,12 @@ TEST(KnnIndex, KnnSearchHighdim) { EXPECT_TRUE(distances64.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchBatch) { +TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { // Define test data. 
- core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init( {{0.719, 0.128, 0.431}, {0.764, 0.970, 0.678}, {0.692, 0.786, 0.211}, {0.692, 0.969, 0.942}, @@ -303,8 +317,8 @@ TEST(KnnIndex, KnnSearchBatch) { index32.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); // int64 // Set up Knn index. @@ -327,8 +341,8 @@ TEST(KnnIndex, KnnSearchBatch) { index64.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); } } // namespace tests From 058c09f7a6a4e39b7a70ecfbe33714d6d62ca418 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Mar 2026 06:33:45 +0000 Subject: [PATCH 13/14] Apply automatic code formatting --- cpp/open3d/core/nns/KnnIndex.h | 8 ++++---- cpp/open3d/core/nns/NearestNeighborSearch.cpp | 13 ++++++------- cpp/tests/core/KnnIndex.cpp | 13 ++++++++----- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cpp/open3d/core/nns/KnnIndex.h b/cpp/open3d/core/nns/KnnIndex.h index 6628c92850c..4ac3b7193f8 100644 --- a/cpp/open3d/core/nns/KnnIndex.h +++ b/cpp/open3d/core/nns/KnnIndex.h @@ -47,10 +47,10 @@ class KnnIndex : public NNSIndex { /// \brief Parameterized Constructor. 
/// - /// \param dataset_points Provides a set of data points as Tensor for nearest - /// neighbor search. CPU tensors use NanoFlann through - /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors are - /// handled by this class. + /// \param dataset_points Provides a set of data points as Tensor for + /// nearest neighbor search. CPU tensors use NanoFlann through + /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors + /// are handled by this class. KnnIndex(const Tensor& dataset_points); KnnIndex(const Tensor& dataset_points, const Dtype& index_dtype); ~KnnIndex(); diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.cpp b/cpp/open3d/core/nns/NearestNeighborSearch.cpp index 920971beaed..f7889f737bb 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.cpp +++ b/cpp/open3d/core/nns/NearestNeighborSearch.cpp @@ -52,12 +52,12 @@ bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { } else { if (dataset_points_.IsSYCL()) { if (!radius.has_value()) { - utility::LogError("radius is required for SYCL FixedRadiusIndex."); + utility::LogError( + "radius is required for SYCL FixedRadiusIndex."); } fixed_radius_index_.reset(new nns::FixedRadiusIndex()); - return fixed_radius_index_->SetTensorData(dataset_points_, - radius.value(), - index_dtype_); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); } return SetIndex(); } @@ -83,9 +83,8 @@ bool NearestNeighborSearch::HybridIndex(std::optional radius) { utility::LogError("radius is required for SYCL HybridIndex."); } fixed_radius_index_.reset(new nns::FixedRadiusIndex()); - return fixed_radius_index_->SetTensorData(dataset_points_, - radius.value(), - index_dtype_); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); } return SetIndex(); } diff --git a/cpp/tests/core/KnnIndex.cpp b/cpp/tests/core/KnnIndex.cpp index fdeb8aa9c52..a5d63beebd1 100644 --- a/cpp/tests/core/KnnIndex.cpp +++ 
b/cpp/tests/core/KnnIndex.cpp @@ -25,9 +25,10 @@ namespace open3d { namespace tests { class KnnIndexPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(KnnIndex, - KnnIndexPermuteDevices, - testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); +INSTANTIATE_TEST_SUITE_P( + KnnIndex, + KnnIndexPermuteDevices, + testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); TEST_P(KnnIndexPermuteDevices, KnnSearch) { // Define test data. @@ -318,7 +319,8 @@ TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); // int64 // Set up Knn index. @@ -342,7 +344,8 @@ TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) << distances.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); } } // namespace tests From 283039b52a0d8e954e4cc87c807b55da23b667a0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 23:27:54 +0000 Subject: [PATCH 14/14] Add SYCL support for Transform kernel: TransformPoints, TransformNormals, RotatePoints, RotateNormals Co-authored-by: ssheorey <41028320+ssheorey@users.noreply.github.com> --- cpp/open3d/t/geometry/kernel/CMakeLists.txt | 6 + cpp/open3d/t/geometry/kernel/Transform.cpp | 24 ++++ cpp/open3d/t/geometry/kernel/Transform.h | 14 +++ cpp/open3d/t/geometry/kernel/TransformImpl.h | 4 + .../t/geometry/kernel/TransformSYCL.cpp | 104 ++++++++++++++++++ 5 files changed, 152 insertions(+) create mode 100644 
cpp/open3d/t/geometry/kernel/TransformSYCL.cpp diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index 223c4d4c7a8..560745b6b5d 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -30,6 +30,12 @@ if (BUILD_CUDA_MODULE) ) endif() +if (BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tgeometry_kernel PRIVATE + TransformSYCL.cpp + ) +endif() + if (WITH_IPP) target_sources(tgeometry_kernel PRIVATE IPPImage.cpp diff --git a/cpp/open3d/t/geometry/kernel/Transform.cpp b/cpp/open3d/t/geometry/kernel/Transform.cpp index 06ec1de454f..185be29bfc2 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.cpp +++ b/cpp/open3d/t/geometry/kernel/Transform.cpp @@ -31,6 +31,12 @@ void TransformPoints(const core::Tensor& transformation, core::Tensor& points) { } else if (points.IsCUDA()) { CUDA_CALL(TransformPointsCUDA, transformation_contiguous, points_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformPointsSYCL(transformation_contiguous, points_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -53,6 +59,12 @@ void TransformNormals(const core::Tensor& transformation, } else if (normals.IsCUDA()) { CUDA_CALL(TransformNormalsCUDA, transformation_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformNormalsSYCL(transformation_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -78,6 +90,12 @@ void RotatePoints(const core::Tensor& R, } else if (points.IsCUDA()) { CUDA_CALL(RotatePointsCUDA, R_contiguous, points_contiguous, center_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotatePointsSYCL(R_contiguous, points_contiguous, center_contiguous); 
+#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -97,6 +115,12 @@ void RotateNormals(const core::Tensor& R, core::Tensor& normals) { RotateNormalsCPU(R_contiguous, normals_contiguous); } else if (normals.IsCUDA()) { CUDA_CALL(RotateNormalsCUDA, R_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotateNormalsSYCL(R_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } diff --git a/cpp/open3d/t/geometry/kernel/Transform.h b/cpp/open3d/t/geometry/kernel/Transform.h index d64d860572e..e51ebae73c4 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.h +++ b/cpp/open3d/t/geometry/kernel/Transform.h @@ -52,6 +52,20 @@ void RotatePointsCUDA(const core::Tensor& R, void RotateNormalsCUDA(const core::Tensor& R, core::Tensor& normals); #endif +#ifdef BUILD_SYCL_MODULE +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points); + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals); + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center); + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals); +#endif + } // namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformImpl.h b/cpp/open3d/t/geometry/kernel/TransformImpl.h index 14c08a01e52..cedcc22fd87 100644 --- a/cpp/open3d/t/geometry/kernel/TransformImpl.h +++ b/cpp/open3d/t/geometry/kernel/TransformImpl.h @@ -87,6 +87,8 @@ OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void RotateNormalsKernel( normals_ptr[2] = x[2]; } +#ifndef OPEN3D_SKIP_TRANSFORM_MAIN + #ifdef __CUDACC__ void TransformPointsCUDA #else @@ -167,6 +169,8 @@ void RotateNormalsCPU }); } +#endif // OPEN3D_SKIP_TRANSFORM_MAIN + } // 
namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp new file mode 100644 index 00000000000..9bfd9078e3a --- /dev/null +++ b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp @@ -0,0 +1,104 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates. +#define OPEN3D_SKIP_TRANSFORM_MAIN +#include "open3d/t/geometry/kernel/TransformImpl.h" +#undef OPEN3D_SKIP_TRANSFORM_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace geometry { +namespace kernel { +namespace transform { + +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* transformation_ptr = + transformation.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformPointsKernel( + transformation_ptr, + points_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* transformation_ptr = + 
transformation.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformNormalsKernel( + transformation_ptr, + normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const scalar_t* center_ptr = center.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotatePointsKernel(R_ptr, + points_ptr + 3 * id[0], + center_ptr); + }) + .wait_and_throw(); + }); +} + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotateNormalsKernel(R_ptr, normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +} // namespace transform +} // namespace kernel +} // namespace geometry +} // namespace t +} // namespace open3d