diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index 210cc33058a..7821b4a8f97 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -102,6 +102,7 @@ open3d_sycl_target_sources(core_impl PRIVATE linalg/LeastSquaresSYCL.cpp linalg/LUSYCL.cpp linalg/MatmulSYCL.cpp + nns/KnnSearchOpsSYCL.cpp linalg/SolveSYCL.cpp linalg/SVDSYCL.cpp linalg/TriSYCL.cpp diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.cpp b/cpp/open3d/core/nns/FixedRadiusIndex.cpp index 167a6701a32..a7316b1c8ff 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.cpp +++ b/cpp/open3d/core/nns/FixedRadiusIndex.cpp @@ -104,8 +104,16 @@ bool FixedRadiusIndex::SetTensorData(const Tensor &dataset_points, CALL_BUILD(double, BuildSpatialHashTableCUDA) #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return true; +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { CALL_BUILD(float, BuildSpatialHashTableCPU) @@ -171,8 +179,18 @@ std::tuple FixedRadiusIndex::SearchRadius( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + FixedRadiusSearchSYCL(RADIUS_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { @@ -240,8 +258,18 @@ std::tuple FixedRadiusIndex::SearchHybrid( }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. 
Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + HybridSearchSYCL(HYBRID_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { diff --git a/cpp/open3d/core/nns/FixedRadiusIndex.h b/cpp/open3d/core/nns/FixedRadiusIndex.h index 838c1ccc484..d3247bb7d29 100644 --- a/cpp/open3d/core/nns/FixedRadiusIndex.h +++ b/cpp/open3d/core/nns/FixedRadiusIndex.h @@ -376,6 +376,40 @@ void HybridSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool sort, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); + +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor& hash_table_splits, + const Tensor& hash_table_index, + const Tensor& hash_table_cell_splits, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance); +#endif + /// \class FixedRadiusIndex /// /// \brief FixedRadiusIndex for nearest neighbor range search. 
diff --git a/cpp/open3d/core/nns/KnnIndex.cpp b/cpp/open3d/core/nns/KnnIndex.cpp index 2d1f5de6b51..a2c24233a2e 100644 --- a/cpp/open3d/core/nns/KnnIndex.cpp +++ b/cpp/open3d/core/nns/KnnIndex.cpp @@ -67,6 +67,17 @@ bool KnnIndex::SetTensorData(const Tensor& dataset_points, utility::LogError( "GPU Tensor is not supported when -DBUILD_CUDA_MODULE=OFF. " "Please recompile Open3d With -DBUILD_CUDA_MODULE=ON."); +#endif + } else if (dataset_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + dataset_points_ = dataset_points.Contiguous(); + points_row_splits_ = points_row_splits.Contiguous(); + index_dtype_ = index_dtype; + return true; +#else + utility::LogError( + "SYCL Tensor is not supported when -DBUILD_SYCL_MODULE=OFF. " + "Please recompile Open3D with -DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( @@ -124,13 +135,24 @@ std::pair KnnIndex::SearchKnn(const Tensor& query_points, }); #else utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " + "-DBUILD_CUDA_MODULE=OFF. Please compile Open3D with " "-DBUILD_CUDA_MODULE=ON."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + const Dtype index_dtype = GetIndexDtype(); + DISPATCH_FLOAT_INT_DTYPE_TO_TEMPLATE(dtype, index_dtype, [&]() { + KnnSearchSYCL(KNN_PARAMETERS); + }); +#else + utility::LogError( + "-DBUILD_SYCL_MODULE=OFF. Please compile Open3D with " + "-DBUILD_SYCL_MODULE=ON."); #endif } else { utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please compile Open3d with " - "-DBUILD_CUDA_MODULE=ON."); + "KnnIndex only supports CUDA and SYCL tensors. 
Please use " + "NanoFlannIndex instead for CPU tensors."); } return std::make_pair(neighbors_index, neighbors_distance); } diff --git a/cpp/open3d/core/nns/KnnIndex.h b/cpp/open3d/core/nns/KnnIndex.h index 458a3dd0b35..4ac3b7193f8 100644 --- a/cpp/open3d/core/nns/KnnIndex.h +++ b/cpp/open3d/core/nns/KnnIndex.h @@ -29,14 +29,28 @@ void KnnSearchCUDA(const Tensor& points, Tensor& neighbors_distance); #endif +#ifdef BUILD_SYCL_MODULE +template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance); +#endif + class KnnIndex : public NNSIndex { public: KnnIndex(); /// \brief Parameterized Constructor. /// - /// \param dataset_points Provides a set of data points as Tensor for KDTree - /// construction. + /// \param dataset_points Provides a set of data points as Tensor for + /// nearest neighbor search. CPU tensors use NanoFlann through + /// open3d::core::nns::NearestNeighborSearch, while CUDA and SYCL tensors + /// are handled by this class. 
KnnIndex(const Tensor& dataset_points); KnnIndex(const Tensor& dataset_points, const Dtype& index_dtype); ~KnnIndex(); diff --git a/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp new file mode 100644 index 00000000000..2d20e4b86fa --- /dev/null +++ b/cpp/open3d/core/nns/KnnSearchOpsSYCL.cpp @@ -0,0 +1,936 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/nns/FixedRadiusIndex.h" +#include "open3d/core/nns/KnnIndex.h" +#include "open3d/core/nns/NeighborSearchAllocator.h" +#include "open3d/utility/Helper.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace nns { + +// SYCL nearest-neighbor search uses tiled distance evaluation with AddMM, +// batched threshold/count helpers, and query-local top-k selection/merge. +namespace { + +// Chooses a query/point tile shape that limits temporary distance storage. +inline void ChooseTileSizeSYCL(int64_t num_queries, + int64_t num_points, + int64_t element_size, + int64_t& tile_queries, + int64_t& tile_points) { + constexpr int64_t kTargetTileBytes = 8 * 1024 * 1024; + tile_queries = std::min(num_queries, 128); + tile_queries = std::max(tile_queries, 1); + tile_points = std::max( + kTargetTileBytes / (tile_queries * element_size), int64_t(256)); + tile_points = std::min(tile_points, num_points); + tile_points = std::max(tile_points, 1); +} + +// Computes squared L2 distances from one query to all points with a SYCL +// kernel. 
This path is still used by the small single-row helpers below. +template +void ComputeSquaredDistancesSYCL(const Device& device, + const T* points_ptr, + int64_t num_points, + int64_t dimension, + const T* query_ptr, + T* distances_ptr) { + if (num_points == 0) { + return; + } + + constexpr int64_t kWorkGroupSize = 128; + const int64_t global_size = + utility::DivUp(num_points, kWorkGroupSize) * kWorkGroupSize; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.submit([&](sycl::handler& cgh) { + sycl::local_accessor local_query(sycl::range<1>(dimension), cgh); + cgh.parallel_for( + sycl::nd_range<1>(sycl::range<1>(global_size), + sycl::range<1>(kWorkGroupSize)), + [=](sycl::nd_item<1> item) { + const int64_t lid = item.get_local_linear_id(); + for (int64_t d = lid; d < dimension; + d += item.get_local_range(0)) { + local_query[d] = query_ptr[d]; + } + item.barrier(sycl::access::fence_space::local_space); + + const int64_t point_idx = item.get_global_linear_id(); + if (point_idx >= num_points) { + return; + } + + const T* point = points_ptr + point_idx * dimension; + T dist = 0; + for (int64_t d = 0; d < dimension; ++d) { + const T diff = point[d] - local_query[d]; + dist += diff * diff; + } + distances_ptr[point_idx] = dist; + }); + }); +} + +// Counts how many entries per query are within a threshold using one launch per +// tile, avoiding per-query device algorithm dispatch. 
+template +void CountWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + int64_t* counts_ptr) { + if (num_queries == 0 || num_points == 0) { + return; + } + + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + int64_t local_count = 0; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + if (query_distances[point_idx] <= threshold) { + ++local_count; + } + } + counts_ptr[query_idx] += local_count; + }); +} + +// Appends per-query threshold matches into the final neighbor buffers using +// caller-provided query offsets. +template +void GatherWithinThresholdQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + T threshold, + TIndex index_offset, + int64_t* offsets_ptr, + TIndex* out_indices_ptr, + T* out_distances_ptr) { + if (num_queries == 0 || num_points == 0) { + return; + } + + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + int64_t offset = offsets_ptr[query_idx]; + for (int64_t point_idx = 0; point_idx < num_points; ++point_idx) { + const T dist = query_distances[point_idx]; + if (dist <= threshold) { + out_indices_ptr[offset] = index_offset + point_idx; + if (out_distances_ptr != nullptr) { + out_distances_ptr[offset] = dist; + } + ++offset; + } + } + offsets_ptr[query_idx] = offset; + }); +} + +// Selects the smallest k distances per query with OneDPL partial_sort while +// reusing scratch index buffers allocated 
by the caller. +template +void SelectTopKQueriesSYCL(const Device& device, + const T* distances_ptr, + int64_t distance_query_stride, + int64_t num_queries, + int64_t num_points, + int64_t knn, + TIndex index_offset, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride, + bool use_threshold = false, + T threshold = T(0)) { + if (num_queries == 0 || num_points == 0 || knn <= 0) { + return; + } + + const T inf = std::numeric_limits::max(); + const int64_t actual_knn = std::min(knn, num_points); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + queue.parallel_for( + sycl::range<2>(num_queries, num_points), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_distances = + distances_ptr + query_idx * distance_query_stride; + std::partial_sort(policy, query_indices, query_indices + actual_knn, + query_indices + num_points, + [query_distances](TIndex lhs, TIndex rhs) { + const T lhs_dist = query_distances[lhs]; + const T rhs_dist = query_distances[rhs]; + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs < rhs; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + + if (k >= actual_knn) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + + const TIndex local_idx = + scratch_indices_ptr[query_idx * 
scratch_query_stride + k]; + const T dist = + distances_ptr[query_idx * distance_query_stride + local_idx]; + if (use_threshold && dist > threshold) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + return; + } + query_out_indices[k] = index_offset + local_idx; + query_out_distances[k] = dist; + }); +} + +// Merges two sorted top-k query buffers into a new top-k query buffer with +// OneDPL partial_sort over 2k candidates. +template +void MergeTopKQueriesSYCL(const Device& device, + const T* current_distances_ptr, + const TIndex* current_indices_ptr, + int64_t current_query_stride, + const T* candidate_distances_ptr, + const TIndex* candidate_indices_ptr, + int64_t candidate_query_stride, + int64_t num_queries, + int64_t knn, + TIndex* scratch_indices_ptr, + int64_t scratch_query_stride, + TIndex* out_indices_ptr, + T* out_distances_ptr, + int64_t out_query_stride) { + if (num_queries == 0 || knn <= 0) { + return; + } + + const T inf = std::numeric_limits::max(); + const int64_t combined_cols = 2 * knn; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + queue.parallel_for( + sycl::range<2>(num_queries, combined_cols), [=](sycl::id<2> id) { + scratch_indices_ptr[id[0] * scratch_query_stride + id[1]] = + static_cast(id[1]); + }); + + for (int64_t query_idx = 0; query_idx < num_queries; ++query_idx) { + TIndex* query_indices = + scratch_indices_ptr + query_idx * scratch_query_stride; + const T* query_current_dist = + current_distances_ptr + query_idx * current_query_stride; + const TIndex* query_current_idx = + current_indices_ptr + query_idx * current_query_stride; + const T* query_candidate_dist = + candidate_distances_ptr + query_idx * candidate_query_stride; + const TIndex* query_candidate_idx = + candidate_indices_ptr + query_idx * candidate_query_stride; + + std::partial_sort( + policy, query_indices, query_indices + knn, + query_indices + 
combined_cols, + [query_current_dist, query_current_idx, query_candidate_dist, + query_candidate_idx, knn](TIndex lhs, TIndex rhs) { + const bool lhs_is_current = lhs < knn; + const bool rhs_is_current = rhs < knn; + const TIndex lhs_idx = + lhs_is_current ? query_current_idx[lhs] + : query_candidate_idx[lhs - knn]; + const TIndex rhs_idx = + rhs_is_current ? query_current_idx[rhs] + : query_candidate_idx[rhs - knn]; + const T lhs_dist = + lhs_is_current ? query_current_dist[lhs] + : query_candidate_dist[lhs - knn]; + const T rhs_dist = + rhs_is_current ? query_current_dist[rhs] + : query_candidate_dist[rhs - knn]; + const bool lhs_valid = lhs_idx >= 0; + const bool rhs_valid = rhs_idx >= 0; + if (lhs_valid != rhs_valid) { + return lhs_valid; + } + if (lhs_dist < rhs_dist) { + return true; + } + if (rhs_dist < lhs_dist) { + return false; + } + return lhs_idx < rhs_idx; + }); + } + + queue.parallel_for(sycl::range<2>(num_queries, knn), [=](sycl::id<2> id) { + const int64_t query_idx = id[0]; + const int64_t k = id[1]; + TIndex* query_out_indices = + out_indices_ptr + query_idx * out_query_stride; + T* query_out_distances = + out_distances_ptr + query_idx * out_query_stride; + const TIndex source = + scratch_indices_ptr[query_idx * scratch_query_stride + k]; + const bool is_current = source < knn; + const int64_t offset = is_current ? source : source - knn; + const TIndex idx = + is_current + ? current_indices_ptr[query_idx * current_query_stride + + offset] + : candidate_indices_ptr[query_idx * + candidate_query_stride + + offset]; + const T dist = + is_current + ? 
current_distances_ptr[query_idx * + current_query_stride + + offset] + : candidate_distances_ptr + [query_idx * candidate_query_stride + offset]; + if (idx < 0) { + query_out_indices[k] = TIndex(-1); + query_out_distances[k] = inf; + } else { + query_out_indices[k] = idx; + query_out_distances[k] = dist; + } + }); +} + +// Finalizes hybrid-search counts and clears any unused tail elements in the +// fixed-width output tensors. +template +void FinalizeHybridResultsSYCL(const Device& device, + const int64_t* counts_ptr, + int64_t num_queries, + int64_t max_knn, + TIndex* neighbors_index_ptr, + T* neighbors_distance_ptr, + TIndex* neighbors_count_ptr) { + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>(num_queries), [=](sycl::id<1> id) { + const int64_t query_idx = id[0]; + const int64_t clipped_count = + std::min(counts_ptr[query_idx], max_knn); + neighbors_count_ptr[query_idx] = static_cast(clipped_count); + for (int64_t k = clipped_count; k < max_knn; ++k) { + neighbors_index_ptr[query_idx * max_knn + k] = TIndex(-1); + neighbors_distance_ptr[query_idx * max_knn + k] = T(0); + } + }); +} + +} // namespace + +// Batched KNN search with tiled AddMM distance evaluation and query-local top-k +// selection/merge on each tile. 
+template +void KnnSearchSYCL(const Tensor& points, + const Tensor& points_row_splits, + const Tensor& queries, + const Tensor& queries_row_splits, + int knn, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + const Device device = points.GetDevice(); + const Dtype dtype = points.GetDtype(); + const Dtype index_dtype = Dtype::FromType(); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + const int batch_size = points_row_splits.GetShape(0) - 1; + std::vector> batch_output_allocators( + batch_size, NeighborSearchAllocator(device)); + + int64_t* neighbors_row_splits_ptr = + neighbors_row_splits.GetDataPtr(); + int64_t last_neighbors_count = 0; + int64_t batch_knn = 0; + + for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + batch_knn = std::min(knn, num_points_i); + + neighbors_row_splits_ptr[query_begin] = last_neighbors_count; + for (int64_t q = 0; q < num_queries_i; ++q) { + neighbors_row_splits_ptr[query_begin + q + 1] = + last_neighbors_count + (q + 1) * batch_knn; + } + last_neighbors_count += num_queries_i * batch_knn; + + TIndex* indices_ptr; + T* distances_ptr; + batch_output_allocators[batch_idx].AllocIndices( + &indices_ptr, num_queries_i * batch_knn, TIndex(-1)); + batch_output_allocators[batch_idx].AllocDistances( + &distances_ptr, num_queries_i * batch_knn, T(0)); + + Tensor out_indices = + batch_output_allocators[batch_idx].NeighborsIndex().View( + 
{num_queries_i, batch_knn}); + Tensor out_distances = + batch_output_allocators[batch_idx].NeighborsDistance().View( + {num_queries_i, batch_knn}); + + // Precompute norms for batched distance evaluation. + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries_i, num_points_i, sizeof(T), tile_queries, + tile_points); + + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = + Tensor::Empty({tile_queries, tile_points}, index_dtype, device); + Tensor tile_top_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor best_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor best_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merged_indices = + Tensor::Empty({tile_queries, batch_knn}, index_dtype, device); + Tensor merged_distances = + Tensor::Empty({tile_queries, batch_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty({tile_queries, 2 * batch_knn}, + index_dtype, device); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + + Tensor best_indices_view = + best_indices.Slice(0, 0, num_queries_iter); + Tensor best_distances_view = + best_distances.Slice(0, 0, num_queries_iter); + best_indices_view.Fill(TIndex(-1)); + best_distances_view.Fill(std::numeric_limits::max()); + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + 
num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, batch_knn, TIndex(p), + tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), batch_knn); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), batch_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), batch_knn, + num_queries_iter, batch_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), batch_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; + } + + out_indices.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_indices_view; + out_distances.Slice(0, q, q + num_queries_iter).AsRvalue() = + best_distances_view; + } + + queue.wait_and_throw(); + } + + if (batch_size == 1) { + neighbors_index = batch_output_allocators[0].NeighborsIndex().View( + {queries.GetShape(0), batch_knn}); + neighbors_distance = + 
batch_output_allocators[0].NeighborsDistance().View( + {queries.GetShape(0), batch_knn}); + return; + } + + NeighborSearchAllocator output_allocator(device); + int64_t neighbors_size = 0; + for (const auto& allocator : batch_output_allocators) { + neighbors_size += allocator.NeighborsIndex().GetShape(0); + } + + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, neighbors_size); + output_allocator.AllocDistances(&neighbors_distance_ptr, neighbors_size); + + int64_t offset = 0; + for (const auto& allocator : batch_output_allocators) { + const int64_t batch_size_i = allocator.NeighborsIndex().GetShape(0); + if (batch_size_i == 0) { + continue; + } + MemoryManager::Memcpy(neighbors_index_ptr + offset, device, + allocator.IndicesPtr(), device, + sizeof(TIndex) * batch_size_i); + MemoryManager::Memcpy(neighbors_distance_ptr + offset, device, + allocator.DistancesPtr(), device, + sizeof(T) * batch_size_i); + offset += batch_size_i; + } + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +// Fixed-radius search using two tiled passes: one to count matches and one to +// gather indices and optional distances. 
+template +void FixedRadiusSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + const bool ignore_query_point, + const bool return_distances, + const bool, + Tensor& neighbors_index, + Tensor& neighbors_row_splits, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL fixed radius search only supports L2 metric."); + } + if (ignore_query_point) { + utility::LogError( + "SYCL fixed radius search does not support " + "ignore_query_point."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); + const T threshold = static_cast(radius * radius); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += 
tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + } + } + } + + queue.wait_and_throw(); + + Tensor counts_cpu = counts.To(Device("CPU:0")); + const int64_t* counts_cpu_ptr = counts_cpu.GetDataPtr(); + std::vector row_splits(num_queries + 1, 0); + for (int64_t q = 0; q < num_queries; ++q) { + row_splits[q + 1] = row_splits[q] + counts_cpu_ptr[q]; + } + neighbors_row_splits = + Tensor(row_splits, {num_queries + 1}, Int64).To(device); + + std::vector row_offsets(row_splits.begin(), row_splits.end() - 1); + Tensor write_offsets = Tensor(row_offsets, {num_queries}, Int64).To(device); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, row_splits.back()); + if (return_distances) { + output_allocator.AllocDistances(&neighbors_distance_ptr, + row_splits.back()); + } else { + 
output_allocator.AllocDistances(&neighbors_distance_ptr, 0); + } + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + int64_t* offsets_ptr = + write_offsets.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + GatherWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, TIndex(point_begin + p), + offsets_ptr, 
neighbors_index_ptr, + return_distances ? neighbors_distance_ptr : nullptr); + } + } + } + + queue.wait_and_throw(); + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); +} + +// Hybrid search counts all neighbors within radius while keeping only the best +// max_knn results per query in fixed-size output tensors. +template +void HybridSearchSYCL(const Tensor& points, + const Tensor& queries, + double radius, + int max_knn, + const Tensor& points_row_splits, + const Tensor& queries_row_splits, + const Tensor&, + const Tensor&, + const Tensor&, + const Metric metric, + Tensor& neighbors_index, + Tensor& neighbors_count, + Tensor& neighbors_distance) { + if (metric != Metric::L2) { + utility::LogError("SYCL hybrid search only supports L2 metric."); + } + + const Device device = points.GetDevice(); + const int64_t num_queries = queries.GetShape(0); + const Dtype dtype = points.GetDtype(); + const T threshold = static_cast(radius * radius); + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + NeighborSearchAllocator output_allocator(device); + TIndex* neighbors_index_ptr; + T* neighbors_distance_ptr; + output_allocator.AllocIndices(&neighbors_index_ptr, num_queries * max_knn, + TIndex(-1)); + output_allocator.AllocDistances(&neighbors_distance_ptr, + num_queries * max_knn, + std::numeric_limits::max()); + + Tensor counts = Tensor::Zeros({num_queries}, Int64, device); + Tensor out_indices = + output_allocator.NeighborsIndex().View({num_queries, max_knn}); + Tensor out_distances = + output_allocator.NeighborsDistance().View({num_queries, max_knn}); + + int64_t tile_queries = 0; + int64_t tile_points = 0; + ChooseTileSizeSYCL(num_queries, points.GetShape(0), sizeof(T), tile_queries, + tile_points); + Tensor temp_distances = + Tensor::Empty({tile_queries, tile_points}, dtype, device); + Tensor tile_sort_indices = Tensor::Empty({tile_queries, tile_points}, + Dtype::FromType(), 
device); + Tensor tile_top_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor tile_top_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merged_indices = Tensor::Empty({tile_queries, max_knn}, + Dtype::FromType(), device); + Tensor merged_distances = + Tensor::Empty({tile_queries, max_knn}, dtype, device); + Tensor merge_sort_indices = Tensor::Empty( + {tile_queries, 2 * max_knn}, Dtype::FromType(), device); + + for (int batch_idx = 0; batch_idx < points_row_splits.GetShape(0) - 1; + ++batch_idx) { + const int64_t point_begin = + points_row_splits[batch_idx].Item(); + const int64_t point_end = + points_row_splits[batch_idx + 1].Item(); + const int64_t query_begin = + queries_row_splits[batch_idx].Item(); + const int64_t query_end = + queries_row_splits[batch_idx + 1].Item(); + const Tensor points_i = points.Slice(0, point_begin, point_end); + const Tensor queries_i = queries.Slice(0, query_begin, query_end); + const int64_t num_points_i = points_i.GetShape(0); + const int64_t num_queries_i = queries_i.GetShape(0); + Tensor point_norms = points_i.Mul(points_i).Sum({1}); + Tensor query_norms = queries_i.Mul(queries_i).Sum({1}); + + for (int64_t q = 0; q < num_queries_i; q += tile_queries) { + const int64_t num_queries_iter = + std::min(tile_queries, num_queries_i - q); + Tensor queries_tile = queries_i.Slice(0, q, q + num_queries_iter); + Tensor query_norms_tile = + query_norms.Slice(0, q, q + num_queries_iter); + Tensor best_indices_view = out_indices.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + Tensor best_distances_view = out_distances.Slice( + 0, query_begin + q, query_begin + q + num_queries_iter); + int64_t* counts_ptr = + counts.GetDataPtr() + query_begin + q; + + for (int64_t p = 0; p < num_points_i; p += tile_points) { + const int64_t num_points_iter = + std::min(tile_points, num_points_i - p); + Tensor points_tile = points_i.Slice(0, p, p + num_points_iter); + Tensor 
point_norms_tile = + point_norms.Slice(0, p, p + num_points_iter); + Tensor temp_distances_view = + temp_distances.Slice(0, 0, num_queries_iter) + .Slice(1, 0, num_points_iter); + + AddMM(queries_tile, points_tile.T(), temp_distances_view, -2.0, + 0.0); + temp_distances_view.Add_( + point_norms_tile.View({1, num_points_iter})); + temp_distances_view.Add_( + query_norms_tile.View({num_queries_iter, 1})); + + CountWithinThresholdQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, threshold, counts_ptr); + + Tensor tile_top_indices_view = + tile_top_indices.Slice(0, 0, num_queries_iter); + Tensor tile_top_distances_view = + tile_top_distances.Slice(0, 0, num_queries_iter); + Tensor merged_indices_view = + merged_indices.Slice(0, 0, num_queries_iter); + Tensor merged_distances_view = + merged_distances.Slice(0, 0, num_queries_iter); + + SelectTopKQueriesSYCL( + device, temp_distances_view.GetDataPtr(), + temp_distances_view.GetStride(0), num_queries_iter, + num_points_iter, max_knn, TIndex(point_begin + p), + tile_sort_indices.GetDataPtr(), + tile_sort_indices.GetStride(0), + tile_top_indices_view.GetDataPtr(), + tile_top_distances_view.GetDataPtr(), max_knn, true, + threshold); + MergeTopKQueriesSYCL( + device, best_distances_view.GetDataPtr(), + best_indices_view.GetDataPtr(), max_knn, + tile_top_distances_view.GetDataPtr(), + tile_top_indices_view.GetDataPtr(), max_knn, + num_queries_iter, max_knn, + merge_sort_indices.GetDataPtr(), + merge_sort_indices.GetStride(0), + merged_indices_view.GetDataPtr(), + merged_distances_view.GetDataPtr(), max_knn); + + best_indices_view.AsRvalue() = merged_indices_view; + best_distances_view.AsRvalue() = merged_distances_view; + } + } + } + + neighbors_index = output_allocator.NeighborsIndex(); + neighbors_distance = output_allocator.NeighborsDistance(); + neighbors_count = + Tensor::Empty({num_queries}, Dtype::FromType(), device); + 
FinalizeHybridResultsSYCL(device, counts.GetDataPtr(), + num_queries, max_knn, + neighbors_index.GetDataPtr(), + neighbors_distance.GetDataPtr(), + neighbors_count.GetDataPtr()); + queue.wait_and_throw(); +} + +#define INSTANTIATE(T, TIndex) \ + template void KnnSearchSYCL( \ + const Tensor& points, const Tensor& points_row_splits, \ + const Tensor& queries, const Tensor& queries_row_splits, int knn, \ + Tensor& neighbors_index, Tensor& neighbors_row_splits, \ + Tensor& neighbors_distance); \ + template void FixedRadiusSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + const Tensor& points_row_splits, const Tensor& queries_row_splits, \ + const Tensor&, const Tensor&, const Tensor&, const Metric metric, \ + const bool ignore_query_point, const bool return_distances, \ + const bool sort, Tensor& neighbors_index, \ + Tensor& neighbors_row_splits, Tensor& neighbors_distance); \ + template void HybridSearchSYCL( \ + const Tensor& points, const Tensor& queries, double radius, \ + int max_knn, const Tensor& points_row_splits, \ + const Tensor& queries_row_splits, const Tensor&, const Tensor&, \ + const Tensor&, const Metric metric, Tensor& neighbors_index, \ + Tensor& neighbors_count, Tensor& neighbors_distance); + +INSTANTIATE(float, int32_t) +INSTANTIATE(float, int64_t) +INSTANTIATE(double, int32_t) +INSTANTIATE(double, int64_t) + +} // namespace nns +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.cpp b/cpp/open3d/core/nns/NearestNeighborSearch.cpp index 9cc8030a35b..f7889f737bb 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.cpp +++ b/cpp/open3d/core/nns/NearestNeighborSearch.cpp @@ -21,21 +21,18 @@ bool NearestNeighborSearch::SetIndex() { }; bool NearestNeighborSearch::KnnIndex() { - if (dataset_points_.IsCUDA()) { -#ifdef BUILD_CUDA_MODULE + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { knn_index_.reset(new nns::KnnIndex()); return 
knn_index_->SetTensorData(dataset_points_, index_dtype_); -#else - utility::LogError( - "-DBUILD_CUDA_MODULE=OFF. Please recompile Open3D with " - "-DBUILD_CUDA_MODULE=ON."); -#endif } else { return SetIndex(); } }; -bool NearestNeighborSearch::MultiRadiusIndex() { return SetIndex(); }; +bool NearestNeighborSearch::MultiRadiusIndex() { + AssertCPU(dataset_points_); + return SetIndex(); +}; bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { if (dataset_points_.IsCUDA()) { @@ -53,6 +50,15 @@ bool NearestNeighborSearch::FixedRadiusIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError( + "radius is required for SYCL FixedRadiusIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); + } return SetIndex(); } } @@ -72,6 +78,14 @@ bool NearestNeighborSearch::HybridIndex(std::optional radius) { #endif } else { + if (dataset_points_.IsSYCL()) { + if (!radius.has_value()) { + utility::LogError("radius is required for SYCL HybridIndex."); + } + fixed_radius_index_.reset(new nns::FixedRadiusIndex()); + return fixed_radius_index_->SetTensorData( + dataset_points_, radius.value(), index_dtype_); + } return SetIndex(); } }; @@ -80,7 +94,7 @@ std::pair NearestNeighborSearch::KnnSearch( const Tensor& query_points, int knn) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (knn_index_) { return knn_index_->SearchKnn(query_points, knn); } else { @@ -99,7 +113,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( const Tensor& query_points, double radius, bool sort) { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return 
fixed_radius_index_->SearchRadius(query_points, radius, sort); @@ -117,7 +131,7 @@ std::tuple NearestNeighborSearch::FixedRadiusSearch( std::tuple NearestNeighborSearch::MultiRadiusSearch( const Tensor& query_points, const Tensor& radii) { - AssertNotCUDA(query_points); + AssertCPU(query_points); AssertTensorDtype(query_points, dataset_points_.GetDtype()); AssertTensorDtype(radii, dataset_points_.GetDtype()); @@ -133,7 +147,7 @@ std::tuple NearestNeighborSearch::HybridSearch( const int max_knn) const { AssertTensorDevice(query_points, dataset_points_.GetDevice()); - if (dataset_points_.IsCUDA()) { + if (dataset_points_.IsCUDA() || dataset_points_.IsSYCL()) { if (fixed_radius_index_) { return fixed_radius_index_->SearchHybrid(query_points, radius, max_knn); @@ -150,11 +164,11 @@ std::tuple NearestNeighborSearch::HybridSearch( } } -void NearestNeighborSearch::AssertNotCUDA(const Tensor& t) const { - if (t.IsCUDA()) { +void NearestNeighborSearch::AssertCPU(const Tensor& t) const { + if (!t.IsCPU()) { utility::LogError( - "TODO: NearestNeighborSearch does not support CUDA tensor " - "yet."); + "NearestNeighborSearch only supports CPU tensors for this " + "operation."); } } diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.h b/cpp/open3d/core/nns/NearestNeighborSearch.h index b9fe14ad1ed..aa22ca352b5 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.h +++ b/cpp/open3d/core/nns/NearestNeighborSearch.h @@ -27,14 +27,12 @@ class NearestNeighborSearch { /// Constructor. /// /// \param dataset_points Dataset points for constructing search index. Must - /// be 2D, with shape {n, d}. SYCL tensors are not yet supported. + /// be 2D, with shape {n, d}. 
// NearestNeighborSearch(const Tensor &dataset_points) // : dataset_points_(dataset_points){}; NearestNeighborSearch(const Tensor &dataset_points, const Dtype &index_dtype = core::Int32) - : dataset_points_(dataset_points), index_dtype_(index_dtype) { - AssertNotSYCL(dataset_points_); - }; + : dataset_points_(dataset_points), index_dtype_(index_dtype) {} ~NearestNeighborSearch(); NearestNeighborSearch(const NearestNeighborSearch &) = delete; NearestNeighborSearch &operator=(const NearestNeighborSearch &) = delete; @@ -47,17 +45,22 @@ class NearestNeighborSearch { /// Set index for multi-radius search. /// + /// Multi-radius search is currently only supported for CPU tensors. + /// /// \return Returns true if building index success, otherwise false. bool MultiRadiusIndex(); /// Set index for fixed-radius search. /// - /// \param radius optional radius parameter. required for gpu fixed radius - /// index. \return Returns true if building index success, otherwise false. + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// fixed radius indices. + /// \return Returns true if building index success, otherwise false. bool FixedRadiusIndex(std::optional radius = {}); /// Set index for hybrid search. /// + /// \param radius Optional radius parameter. Required for CUDA and SYCL + /// hybrid indices. /// \return Returns true if building index success, otherwise false. bool HybridIndex(std::optional radius = {}); @@ -88,6 +91,7 @@ class NearestNeighborSearch { const Tensor &query_points, double radius, bool sort = true); /// Perform multi-radius search. Each query point has an independent radius. + /// Only CPU tensors are currently supported for this search mode. /// /// \param query_points Query points. Must be 2D, with shape {n, d}. /// \param radii Radii of query points. Each query point has one radius. @@ -121,8 +125,8 @@ class NearestNeighborSearch { private: bool SetIndex(); - /// Assert a Tensor is not CUDA tensoer. 
This will be removed in the future. - void AssertNotCUDA(const Tensor &t) const; + /// Assert a Tensor is on CPU. + void AssertCPU(const Tensor &t) const; protected: std::unique_ptr nanoflann_index_; diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index 223c4d4c7a8..560745b6b5d 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -30,6 +30,12 @@ if (BUILD_CUDA_MODULE) ) endif() +if (BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tgeometry_kernel PRIVATE + TransformSYCL.cpp + ) +endif() + if (WITH_IPP) target_sources(tgeometry_kernel PRIVATE IPPImage.cpp diff --git a/cpp/open3d/t/geometry/kernel/Transform.cpp b/cpp/open3d/t/geometry/kernel/Transform.cpp index 06ec1de454f..185be29bfc2 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.cpp +++ b/cpp/open3d/t/geometry/kernel/Transform.cpp @@ -31,6 +31,12 @@ void TransformPoints(const core::Tensor& transformation, core::Tensor& points) { } else if (points.IsCUDA()) { CUDA_CALL(TransformPointsCUDA, transformation_contiguous, points_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformPointsSYCL(transformation_contiguous, points_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -53,6 +59,12 @@ void TransformNormals(const core::Tensor& transformation, } else if (normals.IsCUDA()) { CUDA_CALL(TransformNormalsCUDA, transformation_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TransformNormalsSYCL(transformation_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -78,6 +90,12 @@ void RotatePoints(const core::Tensor& R, } else if (points.IsCUDA()) { CUDA_CALL(RotatePointsCUDA, R_contiguous, points_contiguous, 
center_contiguous); + } else if (points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotatePointsSYCL(R_contiguous, points_contiguous, center_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } @@ -97,6 +115,12 @@ void RotateNormals(const core::Tensor& R, core::Tensor& normals) { RotateNormalsCPU(R_contiguous, normals_contiguous); } else if (normals.IsCUDA()) { CUDA_CALL(RotateNormalsCUDA, R_contiguous, normals_contiguous); + } else if (normals.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + RotateNormalsSYCL(R_contiguous, normals_contiguous); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device"); } diff --git a/cpp/open3d/t/geometry/kernel/Transform.h b/cpp/open3d/t/geometry/kernel/Transform.h index d64d860572e..e51ebae73c4 100644 --- a/cpp/open3d/t/geometry/kernel/Transform.h +++ b/cpp/open3d/t/geometry/kernel/Transform.h @@ -52,6 +52,20 @@ void RotatePointsCUDA(const core::Tensor& R, void RotateNormalsCUDA(const core::Tensor& R, core::Tensor& normals); #endif +#ifdef BUILD_SYCL_MODULE +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points); + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals); + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center); + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals); +#endif + } // namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformImpl.h b/cpp/open3d/t/geometry/kernel/TransformImpl.h index 14c08a01e52..cedcc22fd87 100644 --- a/cpp/open3d/t/geometry/kernel/TransformImpl.h +++ b/cpp/open3d/t/geometry/kernel/TransformImpl.h @@ -87,6 +87,8 @@ OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void RotateNormalsKernel( normals_ptr[2] = x[2]; } +#ifndef 
OPEN3D_SKIP_TRANSFORM_MAIN + #ifdef __CUDACC__ void TransformPointsCUDA #else @@ -167,6 +169,8 @@ void RotateNormalsCPU }); } +#endif // OPEN3D_SKIP_TRANSFORM_MAIN + } // namespace transform } // namespace kernel } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp new file mode 100644 index 00000000000..9bfd9078e3a --- /dev/null +++ b/cpp/open3d/t/geometry/kernel/TransformSYCL.cpp @@ -0,0 +1,104 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates. +#define OPEN3D_SKIP_TRANSFORM_MAIN +#include "open3d/t/geometry/kernel/TransformImpl.h" +#undef OPEN3D_SKIP_TRANSFORM_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace geometry { +namespace kernel { +namespace transform { + +void TransformPointsSYCL(const core::Tensor& transformation, + core::Tensor& points) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* transformation_ptr = + transformation.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformPointsKernel( + transformation_ptr, + points_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void TransformNormalsSYCL(const core::Tensor& transformation, + core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); 
+ DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* transformation_ptr = + transformation.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + TransformNormalsKernel( + transformation_ptr, + normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +void RotatePointsSYCL(const core::Tensor& R, + core::Tensor& points, + const core::Tensor& center) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + points.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(points.GetDtype(), [&]() { + scalar_t* points_ptr = points.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const scalar_t* center_ptr = center.GetDataPtr(); + const int64_t n = points.GetLength(); + queue.parallel_for(sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotatePointsKernel(R_ptr, + points_ptr + 3 * id[0], + center_ptr); + }) + .wait_and_throw(); + }); +} + +void RotateNormalsSYCL(const core::Tensor& R, core::Tensor& normals) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue( + normals.GetDevice()); + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(normals.GetDtype(), [&]() { + scalar_t* normals_ptr = normals.GetDataPtr(); + const scalar_t* R_ptr = R.GetDataPtr(); + const int64_t n = normals.GetLength(); + queue.parallel_for( + sycl::range<1>{(size_t)n}, + [=](sycl::id<1> id) { + RotateNormalsKernel(R_ptr, normals_ptr + 3 * id[0]); + }) + .wait_and_throw(); + }); +} + +} // namespace transform +} // namespace kernel +} // namespace geometry +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index a766715e6e7..785d58c85e4 100644 --- a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -22,6 +22,16 @@ if (BUILD_CUDA_MODULE) ) endif() +if 
(BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tpipelines_kernel PRIVATE + RegistrationSYCL.cpp + FillInLinearSystemSYCL.cpp + RGBDOdometrySYCL.cpp + TransformationConverterSYCL.cpp + FeatureSYCL.cpp + ) +endif() + open3d_show_and_abort_on_warning(tpipelines_kernel) open3d_set_global_properties(tpipelines_kernel) # The kernels are used in the unit tests, so they cannot be hidden for now. diff --git a/cpp/open3d/t/pipelines/kernel/Feature.cpp b/cpp/open3d/t/pipelines/kernel/Feature.cpp index c73573054aa..5a779ecc48b 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.cpp +++ b/cpp/open3d/t/pipelines/kernel/Feature.cpp @@ -44,10 +44,20 @@ void ComputeFPFHFeature( if (points_d.IsCPU()) { ComputeFPFHFeatureCPU(points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); - } else { + } else if (points_d.IsCUDA()) { core::CUDAScopedDevice scoped_device(points.GetDevice()); CUDA_CALL(ComputeFPFHFeatureCUDA, points_d, normals_d, indices, distance2, counts_d, fpfhs, mask, map_info_idx_to_point_idx); + } else if (points_d.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeFPFHFeatureSYCL(points_d, normals_d, indices, distance2, + counts_d, fpfhs, mask, + map_info_idx_to_point_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif + } else { + utility::LogError("Unimplemented device."); } utility::LogDebug( "[ComputeFPFHFeature] Computed {:d} features from " diff --git a/cpp/open3d/t/pipelines/kernel/Feature.h b/cpp/open3d/t/pipelines/kernel/Feature.h index 6645c9bb6cc..cc1d66f6920 100644 --- a/cpp/open3d/t/pipelines/kernel/Feature.h +++ b/cpp/open3d/t/pipelines/kernel/Feature.h @@ -49,6 +49,19 @@ void ComputeFPFHFeatureCUDA( std::nullopt); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional &mask = 
std::nullopt, + const std::optional &map_batch_info_idx_to_point_idx = + std::nullopt); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h index c9948b6a62d..5caef81ff49 100644 --- a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h @@ -104,6 +104,7 @@ OPEN3D_HOST_DEVICE void UpdateSPFHFeature(const scalar_t *feature, spfh[idx * 33 + h_index3 + 22] += hist_incr; } +#ifndef OPEN3D_SKIP_FPFH_MAIN #if defined(__CUDACC__) void ComputeFPFHFeatureCUDA #else @@ -291,6 +292,7 @@ void ComputeFPFHFeatureCPU }); }); } +#endif // OPEN3D_SKIP_FPFH_MAIN } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp new file mode 100644 index 00000000000..8c41acf7aab --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FeatureSYCL.cpp @@ -0,0 +1,205 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; only use the helper templates. 
+#define OPEN3D_SKIP_FPFH_MAIN +#include "open3d/t/pipelines/kernel/FeatureImpl.h" +#undef OPEN3D_SKIP_FPFH_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void ComputeFPFHFeatureSYCL( + const core::Tensor &points, + const core::Tensor &normals, + const core::Tensor &indices, + const core::Tensor &distance2, + const core::Tensor &counts, + core::Tensor &fpfhs, + const std::optional &mask, + const std::optional &map_info_idx_to_point_idx) { + const core::Dtype dtype = points.GetDtype(); + const core::Device device = points.GetDevice(); + const int64_t n_points = points.GetLength(); + + const bool filter_fpfh = + mask.has_value() && map_info_idx_to_point_idx.has_value(); + if (mask.has_value() ^ map_info_idx_to_point_idx.has_value()) { + utility::LogError( + "Parameters mask and map_info_idx_to_point_idx must " + "either be both provided or both not provided."); + } + if (filter_fpfh) { + if (mask.value().GetShape()[0] != n_points) { + utility::LogError( + "Parameter mask was provided, but its size {:d} should" + "be equal to the number of points {:d}.", + (int)mask.value().GetShape()[0], n_points); + } + if (map_info_idx_to_point_idx.value().GetShape()[0] != + counts.GetShape()[0] - (indices.GetShape().size() == 1 ? 1 : 0)) { + utility::LogError( + "Parameter map_info_idx_to_point_idx was provided, " + "but its size" + "{:d} should be equal to the size of counts {:d}.", + (int)map_info_idx_to_point_idx.value().GetShape()[0], + (int)counts.GetShape()[0]); + } + } + + core::Tensor map_spfh_info_idx_to_point_idx = + map_info_idx_to_point_idx.value_or( + core::Tensor::Empty({0}, core::Int64, device)); + + const core::Tensor map_fpfh_idx_to_point_idx = + filter_fpfh ? mask.value().NonZero().GetItem( + {core::TensorKey::Index(0)}) + : core::Tensor::Empty({0}, core::Int64, device); + + const int32_t n_fpfh = + filter_fpfh ? 
map_fpfh_idx_to_point_idx.GetLength() : n_points; + const int32_t n_spfh = + filter_fpfh ? map_spfh_info_idx_to_point_idx.GetLength() : n_points; + + core::Tensor spfhs = + core::Tensor::Zeros({n_spfh, 33}, dtype, fpfhs.GetDevice()); + + core::Tensor map_point_idx_to_spfh_idx; + if (filter_fpfh) { + map_point_idx_to_spfh_idx = core::Tensor::Full( + {n_points}, -1, core::Int64, fpfhs.GetDevice()); + map_point_idx_to_spfh_idx.IndexSet( + {map_spfh_info_idx_to_point_idx}, + core::Tensor::Arange(0, n_spfh, 1, core::Int64, + fpfhs.GetDevice())); + } else { + map_point_idx_to_spfh_idx = + core::Tensor::Empty({0}, core::Int64, fpfhs.GetDevice()); + } + + // Check the nns type (knn = hybrid = false, radius = true). + // The nns radius search mode will resulting a prefix sum 1D tensor. + bool is_radius_search; + int nn_size = 0; + if (indices.GetShape().size() == 1) { + is_radius_search = true; + } else { + is_radius_search = false; + nn_size = indices.GetShape()[1]; + } + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t *points_ptr = points.GetDataPtr(); + const scalar_t *normals_ptr = normals.GetDataPtr(); + const int32_t *indices_ptr = indices.GetDataPtr(); + const scalar_t *distance2_ptr = distance2.GetDataPtr(); + const int32_t *counts_ptr = counts.GetDataPtr(); + scalar_t *spfhs_ptr = spfhs.GetDataPtr(); + scalar_t *fpfhs_ptr = fpfhs.GetDataPtr(); + const int64_t *map_spfh_info_idx_to_point_idx_ptr = + map_spfh_info_idx_to_point_idx.GetDataPtr(); + const int64_t *map_fpfh_idx_to_point_idx_ptr = + map_fpfh_idx_to_point_idx.GetDataPtr(); + const int64_t *map_point_idx_to_spfh_idx_ptr = + map_point_idx_to_spfh_idx.GetDataPtr(); + + // Compute SPFH features for the points. + queue.parallel_for(sycl::range<1>{(size_t)n_spfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_point_idx = + filter_fpfh ? 
map_spfh_info_idx_to_point_idx_ptr + [workload_idx] + : workload_idx; + int64_t idx = 3 * workload_point_idx; + const scalar_t *point = points_ptr + idx; + const scalar_t *normal = normals_ptr + idx; + + const int indice_size = + is_radius_search ? (counts_ptr[workload_idx + 1] - + counts_ptr[workload_idx]) + : counts_ptr[workload_idx]; + + if (indice_size > 1) { + const scalar_t hist_incr = + 100.0 / static_cast(indice_size - 1); + for (int i = 1; i < indice_size; i++) { + const int point_idx = + is_radius_search + ? indices_ptr[i + + counts_ptr[workload_idx]] + : indices_ptr[workload_idx * nn_size + + i]; + + const scalar_t *point_ref = points_ptr + 3 * point_idx; + const scalar_t *normal_ref = + normals_ptr + 3 * point_idx; + scalar_t fea[4] = {0}; + ComputePairFeature(point, normal, point_ref, + normal_ref, fea); + UpdateSPFHFeature(fea, workload_idx, + hist_incr, spfhs_ptr); + } + } + }).wait_and_throw(); + + // Compute FPFH features for the points. + queue.parallel_for(sycl::range<1>{(size_t)n_fpfh}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int64_t workload_spfh_idx = + filter_fpfh ? map_point_idx_to_spfh_idx_ptr + [map_fpfh_idx_to_point_idx_ptr + [workload_idx]] + : workload_idx; + const int indice_size = + is_radius_search ? (counts_ptr[workload_spfh_idx + 1] - + counts_ptr[workload_spfh_idx]) + : counts_ptr[workload_spfh_idx]; + if (indice_size > 1) { + scalar_t sum[3] = {0.0, 0.0, 0.0}; + for (int i = 1; i < indice_size; i++) { + const int idx = + is_radius_search + ? i + counts_ptr[workload_spfh_idx] + : workload_spfh_idx * nn_size + i; + const scalar_t dist = distance2_ptr[idx]; + if (dist == 0.0) continue; + const int32_t spfh_idx = + filter_fpfh ? 
map_point_idx_to_spfh_idx_ptr + [indices_ptr[idx]] + : indices_ptr[idx]; + for (int j = 0; j < 33; j++) { + const scalar_t val = + spfhs_ptr[spfh_idx * 33 + j] / dist; + sum[j / 11] += val; + fpfhs_ptr[workload_idx * 33 + j] += val; + } + } + for (int j = 0; j < 3; j++) { + sum[j] = sum[j] != 0.0 ? 100.0 / sum[j] : 0.0; + } + for (int j = 0; j < 33; j++) { + fpfhs_ptr[workload_idx * 33 + j] *= sum[j / 11]; + fpfhs_ptr[workload_idx * 33 + j] += + spfhs_ptr[workload_spfh_idx * 33 + j]; + } + } + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp index 9d5e825da9a..f9e358d6636 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.cpp @@ -60,6 +60,13 @@ void FillInRigidAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInRigidAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, + Ri_normal_ps, i, j, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -124,6 +131,15 @@ void FillInSLACAlignmentTerm(core::Tensor &AtA, #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACAlignmentTermSYCL(AtA, Atb, residual, Ti_ps, Tj_qs, normal_ps, + Ri_normal_ps, RjT_Ri_normal_ps, + cgrid_idx_ps, cgrid_idx_qs, cgrid_ratio_ps, + cgrid_ratio_qs, i, j, n, threshold); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); @@ -163,6 +179,14 @@ void FillInSLACRegularizerTerm(core::Tensor &AtA, positions_init, positions_curr, weight, n, 
anchor_idx); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (AtA.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + FillInSLACRegularizerTermSYCL( + AtA, Atb, residual, grid_idx, grid_nbs_idx, grid_nbs_mask, + positions_init, positions_curr, weight, n, anchor_idx); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device"); diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h index 1c529813841..69da827ab61 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystem.h @@ -130,6 +130,47 @@ void FillInSLACRegularizerTermCUDA(core::Tensor &AtA, int n, int anchor_idx); +#endif +#ifdef BUILD_SYCL_MODULE +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold); + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_qs, + const core::Tensor &Tj_qs, + const core::Tensor &normal_ps, + const core::Tensor &Ri_normal_ps, + const core::Tensor &RjT_Ri_normal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const core::Tensor &cgrid_ratio_ps, + int i, + int j, + int n, + float threshold); + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n, + int anchor_idx); + #endif } // namespace kernel diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h 
b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h index e158809f6a4..21732102836 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h @@ -13,6 +13,7 @@ namespace open3d { namespace t { namespace pipelines { namespace kernel { +#ifndef OPEN3D_SKIP_FILL_IN_LS_MAIN #if defined(__CUDACC__) void FillInRigidAlignmentTermCUDA #else @@ -471,6 +472,7 @@ void FillInSLACRegularizerTermCPU } }); } +#endif // OPEN3D_SKIP_FILL_IN_LS_MAIN } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp new file mode 100644 index 00000000000..a43767ba40e --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemSYCL.cpp @@ -0,0 +1,462 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +// Skip the CPU/CUDA main function definitions; include only helper includes. 
+#define OPEN3D_SKIP_FILL_IN_LS_MAIN +#include "open3d/t/pipelines/kernel/FillInLinearSystemImpl.h" +#undef OPEN3D_SKIP_FILL_IN_LS_MAIN + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/FillInLinearSystem.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +void FillInRigidAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_ps, + const core::Tensor &Tj_qs, + const core::Tensor &Ri_normal_ps, + int i, + int j, + float threshold) { + core::Device device = AtA.GetDevice(); + int64_t n = Ti_ps.GetLength(); + if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) { + utility::LogError( + "Unable to setup linear system: input length mismatch."); + } + + // First fill in a small 12 x 12 linear system using group reduction. + // kLocalDim = 12*12 + 12 + 1 = 157 elements. + static constexpr int kLocalDim12 = 144; // 12*12 + static constexpr int kLocalDimAtb = 12; + static constexpr int kLocalDimTotal = kLocalDim12 + kLocalDimAtb + 1; + + core::Tensor AtA_local = + core::Tensor::Zeros({12, 12}, core::Float32, device); + core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device); + + float *AtA_local_ptr = static_cast(AtA_local.GetDataPtr()); + float *Atb_local_ptr = static_cast(Atb_local.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const float *Ti_ps_ptr = static_cast(Ti_ps.GetDataPtr()); + const float *Tj_qs_ptr = static_cast(Tj_qs.GetDataPtr()); + const float *Ri_normal_ps_ptr = + static_cast(Ri_normal_ps.GetDataPtr()); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + 
sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + // Private accumulation buffer: [AtA_local(144) | + // Atb_local(12) | residual(1)] + float local_sum[kLocalDimTotal] = {}; + + if (gid < n) { + const float *p_prime = Ti_ps_ptr + 3 * gid; + const float *q_prime = Tj_qs_ptr + 3 * gid; + const float *normal_p_prime = + Ri_normal_ps_ptr + 3 * gid; + + float r = (p_prime[0] - q_prime[0]) * + normal_p_prime[0] + + (p_prime[1] - q_prime[1]) * + normal_p_prime[1] + + (p_prime[2] - q_prime[2]) * + normal_p_prime[2]; + + if (sycl::fabs(r) <= threshold) { + float J_ij[12]; + J_ij[0] = -q_prime[2] * normal_p_prime[1] + + q_prime[1] * normal_p_prime[2]; + J_ij[1] = q_prime[2] * normal_p_prime[0] - + q_prime[0] * normal_p_prime[2]; + J_ij[2] = -q_prime[1] * normal_p_prime[0] + + q_prime[0] * normal_p_prime[1]; + J_ij[3] = normal_p_prime[0]; + J_ij[4] = normal_p_prime[1]; + J_ij[5] = normal_p_prime[2]; + for (int k = 0; k < 6; ++k) { + J_ij[k + 6] = -J_ij[k]; + } + + for (int i_local = 0; i_local < 12; + ++i_local) { + for (int j_local = 0; j_local < 12; + ++j_local) { + local_sum[i_local * 12 + j_local] += + J_ij[i_local] * J_ij[j_local]; + } + local_sum[kLocalDim12 + i_local] += + J_ij[i_local] * r; + } + local_sum[kLocalDim12 + kLocalDimAtb] += r * r; + } + } + + // Group-reduce each element and atomically add to + // the device-side local system buffers. 
+ auto grp = item.get_group(); + for (int k = 0; k < kLocalDim12; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[k], sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref( + AtA_local_ptr[k]) += gv; + } + } + for (int k = 0; k < kLocalDimAtb; ++k) { + float gv = sycl::reduce_over_group( + grp, local_sum[kLocalDim12 + k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref( + Atb_local_ptr[k]) += gv; + } + } + { + float gv = sycl::reduce_over_group( + grp, local_sum[kLocalDim12 + kLocalDimAtb], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref( + *residual_ptr) += gv; + } + } + }); + }).wait_and_throw(); + + // Then fill-in the large linear system. + std::vector indices_vec(12); + for (int k = 0; k < 6; ++k) { + indices_vec[k] = i * 6 + k; + indices_vec[k + 6] = j * 6 + k; + } + + std::vector indices_i_vec; + std::vector indices_j_vec; + for (int local_i = 0; local_i < 12; ++local_i) { + for (int local_j = 0; local_j < 12; ++local_j) { + indices_i_vec.push_back(indices_vec[local_i]); + indices_j_vec.push_back(indices_vec[local_j]); + } + } + + core::Tensor indices(indices_vec, {12}, core::Int64, device); + core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device); + core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device); + + core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j}); + AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12})); + + core::Tensor Atb_sub = Atb.IndexGet({indices}); + Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1})); +} + +void FillInSLACAlignmentTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor &residual, + const core::Tensor &Ti_Cps, + const core::Tensor &Tj_Cqs, + const core::Tensor &Cnormal_ps, + const core::Tensor &Ri_Cnormal_ps, + const core::Tensor &RjT_Ri_Cnormal_ps, + const core::Tensor &cgrid_idx_ps, + const core::Tensor &cgrid_idx_qs, + const core::Tensor &cgrid_ratio_qs, + const 
core::Tensor &cgrid_ratio_ps,
+                                 int i,
+                                 int j,
+                                 int n_frags,
+                                 float threshold) {
+    int64_t n = Ti_Cps.GetLength();
+    if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
+        Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
+        cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
+        cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
+        utility::LogError(
+                "Unable to setup linear system: input length mismatch.");
+    }
+
+    core::Device device = AtA.GetDevice();
+    int n_vars = Atb.GetLength();
+    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
+    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
+    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
+
+    const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
+    const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
+    const float *Cnormal_ps_ptr =
+            static_cast<const float *>(Cnormal_ps.GetDataPtr());
+    const float *Ri_Cnormal_ps_ptr =
+            static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
+    const float *RjT_Ri_Cnormal_ps_ptr =
+            static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());
+
+    const int *cgrid_idx_ps_ptr =
+            static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
+    const int *cgrid_idx_qs_ptr =
+            static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
+    const float *cgrid_ratio_ps_ptr =
+            static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
+    const float *cgrid_ratio_qs_ptr =
+            static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());
+
+    sycl::queue queue =
+            core::sy::SYCLContext::GetInstance().GetDefaultQueue(device);
+    queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) {
+        int64_t workload_idx = id[0];
+        const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
+        const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
+        const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
+        const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx;
+        const float *RjTRi_Cnormal_p =
+                RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;
+
+        const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
+        const int *cgrid_idx_q = 
cgrid_idx_qs_ptr + 8 * workload_idx; + const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx; + const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx; + + float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] + + (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] + + (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2]; + if (sycl::fabs(r) > threshold) return; + + float J[60]; + int idx[60]; + + J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2]; + J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2]; + J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1]; + J[3] = Ri_Cnormal_p[0]; + J[4] = Ri_Cnormal_p[1]; + J[5] = Ri_Cnormal_p[2]; + + for (int k = 0; k < 6; ++k) { + J[k + 6] = -J[k]; + idx[k + 0] = 6 * i + k; + idx[k + 6] = 6 * j + k; + } + + for (int k = 0; k < 8; ++k) { + J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0]; + J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1]; + J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2]; + idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0; + idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1; + idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2; + } + + for (int k = 0; k < 8; ++k) { + J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0]; + J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1]; + J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2]; + idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0; + idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1; + idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2; + } + + for (int ki = 0; ki < 60; ++ki) { + for (int kj = 0; kj < 60; ++kj) { + float AtA_ij = J[ki] * J[kj]; + int ij = idx[ki] * n_vars + idx[kj]; + sycl::atomic_ref( + AtA_ptr[ij]) += AtA_ij; + } + sycl::atomic_ref( + Atb_ptr[idx[ki]]) += J[ki] * r; + } + sycl::atomic_ref(*residual_ptr) += + r * r; + }).wait_and_throw(); +} + +void FillInSLACRegularizerTermSYCL(core::Tensor &AtA, + core::Tensor &Atb, + core::Tensor 
&residual, + const core::Tensor &grid_idx, + const core::Tensor &grid_nbs_idx, + const core::Tensor &grid_nbs_mask, + const core::Tensor &positions_init, + const core::Tensor &positions_curr, + float weight, + int n_frags, + int anchor_idx) { + int64_t n = grid_idx.GetLength(); + int64_t n_vars = Atb.GetLength(); + + core::Device device = AtA.GetDevice(); + float *AtA_ptr = static_cast(AtA.GetDataPtr()); + float *Atb_ptr = static_cast(Atb.GetDataPtr()); + float *residual_ptr = static_cast(residual.GetDataPtr()); + + const int *grid_idx_ptr = static_cast(grid_idx.GetDataPtr()); + const int *grid_nbs_idx_ptr = + static_cast(grid_nbs_idx.GetDataPtr()); + const bool *grid_nbs_mask_ptr = + static_cast(grid_nbs_mask.GetDataPtr()); + const float *positions_init_ptr = + static_cast(positions_init.GetDataPtr()); + const float *positions_curr_ptr = + static_cast(positions_curr.GetDataPtr()); + + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.parallel_for(sycl::range<1>{(size_t)n}, [=](sycl::id<1> id) { + int64_t workload_idx = id[0]; + int idx_i = grid_idx_ptr[workload_idx]; + + const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx; + const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx; + + float cov[3][3] = {{0}}; + float U[3][3], V[3][3], S[3]; + + int cnt = 0; + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + + for (int ii = 0; ii < 3; ++ii) { + for (int jj = 0; jj < 3; ++jj) { 
+ cov[ii][jj] += diff_ik_init[ii] * diff_ik_curr[jj]; + } + } + ++cnt; + } + + if (cnt < 3) return; + + core::linalg::kernel::svd3x3(*cov, *U, S, *V); + + float R[3][3]; + core::linalg::kernel::transpose3x3_(*U); + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + + float d = core::linalg::kernel::det3x3(*R); + if (d < 0) { + U[2][0] = -U[2][0]; + U[2][1] = -U[2][1]; + U[2][2] = -U[2][2]; + core::linalg::kernel::matmul3x3_3x3(*V, *U, *R); + } + + if (idx_i == anchor_idx) { + R[0][0] = R[1][1] = R[2][2] = 1; + R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0; + } + + for (int k = 0; k < 6; ++k) { + if (!mask_nbs[k]) continue; + int idx_k = idx_nbs[k]; + + float diff_ik_init[3] = { + positions_init_ptr[idx_i * 3 + 0] - + positions_init_ptr[idx_k * 3 + 0], + positions_init_ptr[idx_i * 3 + 1] - + positions_init_ptr[idx_k * 3 + 1], + positions_init_ptr[idx_i * 3 + 2] - + positions_init_ptr[idx_k * 3 + 2]}; + float diff_ik_curr[3] = { + positions_curr_ptr[idx_i * 3 + 0] - + positions_curr_ptr[idx_k * 3 + 0], + positions_curr_ptr[idx_i * 3 + 1] - + positions_curr_ptr[idx_k * 3 + 1], + positions_curr_ptr[idx_i * 3 + 2] - + positions_curr_ptr[idx_k * 3 + 2]}; + float R_diff_ik_curr[3]; + core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init, + R_diff_ik_curr); + + float local_r[3]; + local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0]; + local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1]; + local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2]; + + int offset_idx_i = 3 * idx_i + 6 * n_frags; + int offset_idx_k = 3 * idx_k + 6 * n_frags; + + sycl::atomic_ref(*residual_ptr) += + weight * + (local_r[0] * local_r[0] + local_r[1] * local_r[1] + + local_r[2] * local_r[2]); + + for (int axis = 0; axis < 3; ++axis) { + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + offset_idx_i + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_k + axis]) += weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_i + axis) * n_vars + + 
offset_idx_k + axis]) -= weight; + sycl::atomic_ref( + AtA_ptr[(offset_idx_k + axis) * n_vars + + offset_idx_i + axis]) -= weight; + + sycl::atomic_ref( + Atb_ptr[offset_idx_i + axis]) += + weight * local_r[axis]; + sycl::atomic_ref( + Atb_ptr[offset_idx_k + axis]) -= + weight * local_r[axis]; + } + } + }).wait_and_throw(); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp index ac61daeaa2c..4a9afaff45f 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp @@ -58,6 +58,15 @@ void ComputeOdometryResultPointToPlane( target_vertex_map, target_normal_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultPointToPlaneSYCL( + source_vertex_map, target_vertex_map, target_normal_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, depth_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -118,6 +127,16 @@ void ComputeOdometryResultIntensity(const core::Tensor &source_depth, target_intensity_dx, target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultIntensitySYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_intensity_dx, target_intensity_dy, source_vertex_map, + intrinsics_d, trans_d, delta, inlier_residual, inlier_count, + depth_outlier_trunc, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { 
utility::LogError("Unimplemented device."); } @@ -187,6 +206,17 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth, source_vertex_map, intrinsics_d, trans_d, delta, inlier_residual, inlier_count, depth_outlier_trunc, depth_huber_delta, intensity_huber_delta); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryResultHybridSYCL( + source_depth, target_depth, source_intensity, target_intensity, + target_depth_dx, target_depth_dy, target_intensity_dx, + target_intensity_dy, source_vertex_map, intrinsics_d, trans_d, + delta, inlier_residual, inlier_count, depth_outlier_trunc, + depth_huber_delta, intensity_huber_delta); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -222,6 +252,14 @@ void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map, CUDA_CALL(ComputeOdometryInformationMatrixCUDA, source_vertex_map, target_vertex_map, intrinsic, source_to_target, square_dist_thr, information); + } else if (device.GetType() == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputeOdometryInformationMatrixSYCL( + source_vertex_map, target_vertex_map, intrinsic, + source_to_target, square_dist_thr, information); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h index d1aacbbeac3..3c65eec81e8 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryImpl.h @@ -125,6 +125,61 @@ void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_depth, core::Tensor& information); #endif +#ifdef BUILD_SYCL_MODULE +void ComputeOdometryResultPointToPlaneSYCL( + const core::Tensor& source_vertex_map, + const core::Tensor& target_vertex_map, + const core::Tensor& 
target_normal_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta); + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta); + +void ComputeOdometryResultHybridSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& source_intensity, + const core::Tensor& target_intensity, + const core::Tensor& target_depth_dx, + const core::Tensor& target_depth_dy, + const core::Tensor& target_intensity_dx, + const core::Tensor& target_intensity_dy, + const core::Tensor& source_vertex_map, + const core::Tensor& intrinsics, + const core::Tensor& init_source_to_target, + core::Tensor& delta, + float& inlier_residual, + int& inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta); + +void ComputeOdometryInformationMatrixSYCL(const core::Tensor& source_depth, + const core::Tensor& target_depth, + const core::Tensor& intrinsic, + const core::Tensor& source_to_target, + const float square_dist_thr, + core::Tensor& information); +#endif + } // namespace odometry } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp new file mode 100644 index 00000000000..c27b3159609 --- /dev/null +++ 
b/cpp/open3d/t/pipelines/kernel/RGBDOdometrySYCL.cpp @@ -0,0 +1,398 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/geometry/kernel/GeometryIndexer.h" +#include "open3d/t/geometry/kernel/GeometryMacros.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryImpl.h" +#include "open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { +namespace odometry { + +using t::geometry::kernel::NDArrayIndexer; +using t::geometry::kernel::TransformIndexer; + +static constexpr int kReduceDimOdometry = 29; +static constexpr int kJtJDimOdometry = 21; + +/// Perform a group reduction over a fixed-size private float array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template <int N>
+static inline void GroupReduceAndAdd(sycl::nd_item<1> item,
+                                     const float (&local_sum)[N],
+                                     float *global_sum_ptr) {
+    auto grp = item.get_group();
+    for (int k = 0; k < N; ++k) {
+        float grp_val =
+                sycl::reduce_over_group(grp, local_sum[k], sycl::plus<float>{});
+        if (item.get_local_id(0) == 0) {
+            sycl::atomic_ref<float, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device>(global_sum_ptr[k]) +=
+                    grp_val;
+        }
+    }
+}
+
+void ComputeOdometryResultPointToPlaneSYCL(
+        const core::Tensor &source_vertex_map,
+        const core::Tensor &target_vertex_map,
+        const core::Tensor &target_normal_map,
+        const core::Tensor &intrinsics,
+        const core::Tensor &init_source_to_target,
+        core::Tensor &delta,
+        float &inlier_residual,
+        int &inlier_count,
+        const float depth_outlier_trunc,
+        const float depth_huber_delta) {
+    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
+    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
+    NDArrayIndexer target_normal_indexer(target_normal_map, 2);
+
+    core::Device device = source_vertex_map.GetDevice();
+    core::Tensor trans = init_source_to_target;
+    TransformIndexer ti(intrinsics, trans);
+
+    const int64_t rows = source_vertex_indexer.GetShape(0);
+    const int64_t cols = source_vertex_indexer.GetShape(1);
+    const int64_t n = rows * cols;
+
+    core::Tensor global_sum =
+            core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device);
+    float *global_sum_ptr = global_sum.GetDataPtr<float>();
+
+    auto device_props =
+            core::sy::SYCLContext::GetInstance().GetDeviceProperties(device);
+    sycl::queue queue = device_props.queue;
+    const size_t wgs = device_props.max_work_group_size;
+    const size_t num_groups = ((size_t)n + wgs - 1) / wgs;
+
+    queue.submit([&](sycl::handler &cgh) {
+        cgh.parallel_for(
+                sycl::nd_range<1>{num_groups * wgs, wgs},
+                [=](sycl::nd_item<1> item) {
+                    const int gid = item.get_global_id(0);
+                    float local_sum[kReduceDimOdometry] = {};
+
+                    if (gid < n) {
+                        const int y = gid / cols;
+                        const int x = gid % cols;
+
+                        float J[6] = {0};
+                        float r = 0;
+                        const bool valid = 
GetJacobianPointToPlane( + x, y, depth_outlier_trunc, + source_vertex_indexer, + target_vertex_indexer, + target_normal_indexer, ti, J, r); + + if (valid) { + const float d_huber = + HuberDeriv(r, depth_huber_delta); + const float r_huber = + HuberLoss(r, depth_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += J[i] * J[j]; + } + local_sum[21 + i] += J[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryResultIntensitySYCL( + const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + 
core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}; + float r_I = 0; + const bool valid = GetJacobianIntensity( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, r_I); + + if (valid) { + const float d_huber = + HuberDeriv(r_I, intensity_huber_delta); + const float r_huber = + HuberLoss(r_I, intensity_huber_delta); + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += J_I[i] * J_I[j]; + } + local_sum[21 + i] += J_I[i] * d_huber; + } + local_sum[27] += r_huber; + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryResultHybridSYCL(const core::Tensor &source_depth, + const core::Tensor &target_depth, + const core::Tensor &source_intensity, + const core::Tensor &target_intensity, + const core::Tensor &target_depth_dx, + const core::Tensor &target_depth_dy, + const core::Tensor &target_intensity_dx, + const core::Tensor &target_intensity_dy, + const core::Tensor &source_vertex_map, + const core::Tensor &intrinsics, + const core::Tensor &init_source_to_target, + 
core::Tensor &delta, + float &inlier_residual, + int &inlier_count, + const float depth_outlier_trunc, + const float depth_huber_delta, + const float intensity_huber_delta) { + NDArrayIndexer source_depth_indexer(source_depth, 2); + NDArrayIndexer target_depth_indexer(target_depth, 2); + NDArrayIndexer source_intensity_indexer(source_intensity, 2); + NDArrayIndexer target_intensity_indexer(target_intensity, 2); + NDArrayIndexer target_depth_dx_indexer(target_depth_dx, 2); + NDArrayIndexer target_depth_dy_indexer(target_depth_dy, 2); + NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2); + NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2); + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = init_source_to_target; + TransformIndexer ti(intrinsics, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kReduceDimOdometry}, core::Float32, device); + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float local_sum[kReduceDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_I[6] = {0}, J_D[6] = {0}; + float r_I = 0, r_D = 0; + const bool valid = GetJacobianHybrid( + x, y, depth_outlier_trunc, + source_depth_indexer, target_depth_indexer, + source_intensity_indexer, + target_intensity_indexer, + target_depth_dx_indexer, + 
target_depth_dy_indexer, + target_intensity_dx_indexer, + target_intensity_dy_indexer, + source_vertex_indexer, ti, J_I, J_D, r_I, + r_D); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_I[i] * J_I[j] + + J_D[i] * J_D[j]; + } + local_sum[21 + i] += + J_I[i] * + HuberDeriv( + r_I, + intensity_huber_delta) + + J_D[i] * + HuberDeriv( + r_D, + depth_huber_delta); + } + local_sum[27] += + HuberLoss(r_I, intensity_huber_delta) + + HuberLoss(r_D, depth_huber_delta); + local_sum[28] += 1.0f; + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count); +} + +void ComputeOdometryInformationMatrixSYCL(const core::Tensor &source_vertex_map, + const core::Tensor &target_vertex_map, + const core::Tensor &intrinsic, + const core::Tensor &source_to_target, + const float square_dist_thr, + core::Tensor &information) { + NDArrayIndexer source_vertex_indexer(source_vertex_map, 2); + NDArrayIndexer target_vertex_indexer(target_vertex_map, 2); + + core::Device device = source_vertex_map.GetDevice(); + core::Tensor trans = source_to_target; + TransformIndexer ti(intrinsic, trans); + + const int64_t rows = source_vertex_indexer.GetShape(0); + const int64_t cols = source_vertex_indexer.GetShape(1); + const int64_t n = rows * cols; + + core::Tensor global_sum = + core::Tensor::Zeros({kJtJDimOdometry}, core::Float32, device); + float *global_sum_ptr = global_sum.GetDataPtr(); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + float 
local_sum[kJtJDimOdometry] = {}; + + if (gid < n) { + const int y = gid / cols; + const int x = gid % cols; + + float J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0}; + float rx = 0, ry = 0, rz = 0; + const bool valid = GetJacobianPointToPoint( + x, y, square_dist_thr, + source_vertex_indexer, + target_vertex_indexer, ti, J_x, J_y, J_z, + rx, ry, rz); + + if (valid) { + int offset = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j <= i; ++j) { + local_sum[offset++] += + J_x[i] * J_x[j] + + J_y[i] * J_y[j] + + J_z[i] * J_z[j]; + } + } + } + } + + GroupReduceAndAdd(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + information = core::Tensor::Empty({6, 6}, core::Float64, host); + global_sum = global_sum.To(host, core::Float64); + + double *info_ptr = information.GetDataPtr(); + double *reduction_ptr = global_sum.GetDataPtr(); + for (int j = 0; j < 6; j++) { + const int64_t reduction_idx = ((j * (j + 1)) / 2); + for (int k = 0; k <= j; k++) { + info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k]; + info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k]; + } + } +} + +} // namespace odometry +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/Registration.cpp b/cpp/open3d/t/pipelines/kernel/Registration.cpp index 326671431f3..95ccddec5c2 100644 --- a/cpp/open3d/t/pipelines/kernel/Registration.cpp +++ b/cpp/open3d/t/pipelines/kernel/Registration.cpp @@ -41,6 +41,16 @@ core::Tensor ComputePosePointToPlane(const core::Tensor &source_points, target_points.Contiguous(), target_normals.Contiguous(), correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePosePointToPlaneSYCL( + source_points.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), 
pose, residual, + inlier_count, source_points.GetDtype(), device, kernel); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -85,6 +95,18 @@ core::Tensor ComputePoseColoredICP(const core::Tensor &source_points, correspondence_indices.Contiguous(), pose, residual, inlier_count, source_points.GetDtype(), device, kernel, lambda_geometric); + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputePoseColoredICPSYCL( + source_points.Contiguous(), source_colors.Contiguous(), + target_points.Contiguous(), target_normals.Contiguous(), + target_colors.Contiguous(), target_color_gradients.Contiguous(), + correspondence_indices.Contiguous(), pose, residual, + inlier_count, source_points.GetDtype(), device, kernel, + lambda_geometric); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -191,6 +213,21 @@ core::Tensor ComputePoseDopplerICP( v_v_in_V.Contiguous(), period, reject_outliers, doppler_outlier_threshold, kernel_geometric, kernel_doppler, lambda_doppler); + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + ComputePoseDopplerICPSYCL( + source_points.Contiguous(), source_dopplers.Contiguous(), + source_directions.Contiguous(), target_points.Contiguous(), + target_normals.Contiguous(), + correspondence_indices.Contiguous(), output_pose, residual, + inlier_count, dtype, device, R_S_to_V.Contiguous(), + r_v_to_s_in_V.Contiguous(), w_v_in_V.Contiguous(), + v_v_in_V.Contiguous(), period, reject_outliers, + doppler_outlier_threshold, kernel_geometric, kernel_doppler, + lambda_doppler); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } @@ -270,6 +307,51 @@ std::tuple ComputeRtPointToPoint( t = mean_t.Reshape({-1}) - 
R.Matmul(mean_s.T()).Reshape({-1}); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (source_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + // Use tensor operations which work on SYCL devices. + core::Tensor valid = correspondence_indices.Ne(-1).Reshape({-1}); + + if (valid.GetLength() == 0) { + utility::LogError("No valid correspondence present."); + } + + core::Tensor source_indices = + core::Tensor::Arange(0, source_points.GetShape()[0], 1, + core::Int64, device) + .IndexGet({valid}); + core::Tensor target_indices = + correspondence_indices.IndexGet({valid}).Reshape({-1}); + + inlier_count = source_indices.GetLength(); + + core::Tensor source_select = source_points.IndexGet({source_indices}); + core::Tensor target_select = target_points.IndexGet({target_indices}); + + core::Tensor mean_s = source_select.Mean({0}, true); + core::Tensor mean_t = target_select.Mean({0}, true); + + core::Device host("CPU:0"); + core::Tensor Sxy = (target_select - mean_t) + .T() + .Matmul(source_select - mean_s) + .Div_(static_cast(inlier_count)) + .To(host, core::Float64); + + mean_s = mean_s.To(host, core::Float64); + mean_t = mean_t.To(host, core::Float64); + + core::Tensor U, D, VT; + std::tie(U, D, VT) = Sxy.SVD(); + core::Tensor S = core::Tensor::Eye(3, core::Float64, host); + if (U.Det() * (VT.T()).Det() < 0) { + S[-1][-1] = -1; + } + R = U.Matmul(S.Matmul(VT)); + t = mean_t.Reshape({-1}) - R.Matmul(mean_s.T()).Reshape({-1}); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -294,6 +376,14 @@ core::Tensor ComputeInformationMatrix( CUDA_CALL(ComputeInformationMatrixCUDA, target_points.Contiguous(), correspondence_indices.Contiguous(), information_matrix, target_points.GetDtype(), device); + } else if (target_points.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ComputeInformationMatrixSYCL( + target_points.Contiguous(), 
correspondence_indices.Contiguous(), + information_matrix, target_points.GetDtype(), device); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else { utility::LogError("Unimplemented device."); } diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index 3f08ca4c811..94acdf773b3 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -149,6 +149,63 @@ void ComputeInformationMatrixCUDA(const core::Tensor &target_points, const core::Device &device); #endif +#ifdef BUILD_SYCL_MODULE +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel); + +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric); + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const 
core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel &kernel_doppler, + const double lambda_doppler); + +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device); +#endif + template OPEN3D_HOST_DEVICE inline bool GetJacobianPointToPlane( int64_t workload_idx, diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp new file mode 100644 index 00000000000..c7503191fdb --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/RegistrationSYCL.cpp @@ -0,0 +1,554 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/t/pipelines/kernel/RegistrationImpl.h" +#include "open3d/t/pipelines/kernel/TransformationConverter.h" +#include "open3d/t/pipelines/registration/RobustKernel.h" +#include "open3d/t/pipelines/registration/RobustKernelImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +static constexpr int kReduceDim = + 29; // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier) + +/// Perform a group reduction over a fixed-size private array using +/// sycl::reduce_over_group, then have work item 0 atomically accumulate each +/// group partial sum into the device-side global buffer. 
+template +static inline void GroupReduceAndAdd(sycl::nd_item<1> item, + const scalar_t (&local_sum)[N], + scalar_t *global_sum_ptr) { + auto grp = item.get_group(); + for (int k = 0; k < N; ++k) { + scalar_t grp_val = sycl::reduce_over_group(grp, local_sum[k], + sycl::plus{}); + if (item.get_local_id(0) == 0) { + sycl::atomic_ref(global_sum_ptr[k]) += + grp_val; + } + } +} + +void ComputePosePointToPlaneSYCL(const core::Tensor &source_points, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_ij[6] = {0}; + scalar_t r = 0; + const bool valid = GetJacobianPointToPlane< + scalar_t>( + gid, source_points_ptr, + 
target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + J_ij, r); + + if (valid) { + const scalar_t w = + GetWeightFromRobustKernel( + r); + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_ij[j] * w * + J_ij[k]; + } + local_sum[21 + j] += + J_ij[j] * w * r; + } + local_sum[27] += r; + local_sum[28] += scalar_t(1); + } + } + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseColoredICPSYCL(const core::Tensor &source_points, + const core::Tensor &source_colors, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &target_colors, + const core::Tensor &target_color_gradients, + const core::Tensor &correspondence_indices, + core::Tensor &pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + const registration::RobustKernel &kernel, + const double &lambda_geometric) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + const scalar_t sqrt_lambda_geometric = + static_cast(sqrt(lambda_geometric)); + const scalar_t sqrt_lambda_photometric = + static_cast(sqrt(1.0 - lambda_geometric)); + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + + DISPATCH_ROBUST_KERNEL_FUNCTION( + kernel.type_, scalar_t, kernel.scaling_parameter_, + kernel.shape_parameter_, [&]() { + const scalar_t *source_points_ptr = + source_points.GetDataPtr(); + const scalar_t *source_colors_ptr = + source_colors.GetDataPtr(); + const scalar_t 
*target_points_ptr = + target_points.GetDataPtr(); + const scalar_t *target_normals_ptr = + target_normals.GetDataPtr(); + const scalar_t *target_colors_ptr = + target_colors.GetDataPtr(); + const scalar_t *target_color_gradients_ptr = + target_color_gradients.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[kReduceDim] = {}; + + if (gid < n) { + scalar_t J_G[6] = {0}, + J_I[6] = {0}; + scalar_t r_G = 0, r_I = 0; + + const bool valid = GetJacobianColoredICP< + scalar_t>( + gid, source_points_ptr, + source_colors_ptr, + target_points_ptr, + target_normals_ptr, + target_colors_ptr, + target_color_gradients_ptr, + correspondence_indices_ptr, + sqrt_lambda_geometric, + sqrt_lambda_photometric, + J_G, J_I, r_G, r_I); + + if (valid) { + const scalar_t w_G = + GetWeightFromRobustKernel( + r_G); + const scalar_t w_I = + GetWeightFromRobustKernel( + r_I); + + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; + ++k) { + local_sum[i++] += + J_G[j] * w_G * + J_G[k] + + J_I[j] * w_I * + J_I[k]; + } + local_sum[21 + j] += + J_G[j] * w_G * + r_G + + J_I[j] * w_I * r_I; + } + local_sum[27] += + r_G * r_G + r_I * r_I; + local_sum[28] += scalar_t(1); + } + } + + GroupReduceAndAdd( + item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, pose, residual, inlier_count); +} + +void ComputePoseDopplerICPSYCL( + const core::Tensor &source_points, + const core::Tensor &source_dopplers, + const core::Tensor &source_directions, + const core::Tensor &target_points, + const core::Tensor &target_normals, + const core::Tensor &correspondence_indices, + core::Tensor &output_pose, + float &residual, + int &inlier_count, + const core::Dtype &dtype, + const core::Device &device, + 
const core::Tensor &R_S_to_V, + const core::Tensor &r_v_to_s_in_V, + const core::Tensor &w_v_in_V, + const core::Tensor &v_v_in_V, + const double period, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const registration::RobustKernel &kernel_geometric, + const registration::RobustKernel &kernel_doppler, + const double lambda_doppler) { + const int n = source_points.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({kReduceDim}, dtype, device); + core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, + [&]() { + const scalar_t sqrt_lambda_geometric = + sqrt(1.0 - + static_cast( + lambda_doppler)); + const scalar_t sqrt_lambda_doppler = + sqrt(lambda_doppler); + const scalar_t + sqrt_lambda_doppler_by_dt = + sqrt_lambda_doppler / + static_cast( + period); + + // Pre-compute v_s_in_S using a + // single-task kernel. 
+ { + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *w_v_in_V_ptr = + w_v_in_V.GetDataPtr< + scalar_t>(); + const scalar_t *v_v_in_V_ptr = + v_v_in_V.GetDataPtr< + scalar_t>(); + scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + queue.single_task([=]() { + PreComputeForDopplerICP( + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + w_v_in_V_ptr, + v_v_in_V_ptr, + v_s_in_S_ptr); + }).wait_and_throw(); + } + + scalar_t *global_sum_ptr = + global_sum.GetDataPtr< + scalar_t>(); + + DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(scalar_t, + kernel_geometric + .type_, + kernel_geometric + .scaling_parameter_, + kernel_doppler + .type_, + kernel_doppler + .scaling_parameter_, + [&]() { + const scalar_t *source_points_ptr = + source_points + .GetDataPtr< + scalar_t>(); + const scalar_t *source_dopplers_ptr = + source_dopplers + .GetDataPtr< + scalar_t>(); + const scalar_t *source_directions_ptr = + source_directions + .GetDataPtr< + scalar_t>(); + const scalar_t *target_points_ptr = + target_points + .GetDataPtr< + scalar_t>(); + const scalar_t *target_normals_ptr = + target_normals + .GetDataPtr< + scalar_t>(); + const int64_t *correspondence_indices_ptr = + correspondence_indices + .GetDataPtr< + int64_t>(); + const scalar_t *R_S_to_V_ptr = + R_S_to_V.GetDataPtr< + scalar_t>(); + const scalar_t *r_v_to_s_in_V_ptr = + r_v_to_s_in_V + .GetDataPtr< + scalar_t>(); + const scalar_t *v_s_in_S_ptr = + v_s_in_S.GetDataPtr< + scalar_t>(); + + queue + .submit([&](sycl::handler + &cgh) { + cgh.parallel_for( + sycl::nd_range< + 1>{ + num_groups * + wgs, + wgs}, + [=](sycl::nd_item< + 1> item) { + const int gid = + item.get_global_id( + 0); + scalar_t local_sum + [kReduceDim] = + {}; + + if (gid < + n) { + scalar_t J_G + [6] = {0}, + J_D[6] = { + 0}; + scalar_t + r_G = 0, + r_D = 0; + + const bool valid = GetJacobianDopplerICP< + scalar_t>( + gid, + source_points_ptr, + 
source_dopplers_ptr, + source_directions_ptr, + target_points_ptr, + target_normals_ptr, + correspondence_indices_ptr, + R_S_to_V_ptr, + r_v_to_s_in_V_ptr, + v_s_in_S_ptr, + reject_dynamic_outliers, + static_cast< + scalar_t>( + doppler_outlier_threshold), + sqrt_lambda_geometric, + sqrt_lambda_doppler, + sqrt_lambda_doppler_by_dt, + J_G, + J_D, + r_G, + r_D); + + if (valid) { + const scalar_t + w_G = GetWeightFromRobustKernelFirst( + r_G); + const scalar_t + w_D = GetWeightFromRobustKernelSecond( + r_D); + + int i = 0; + for (int j = 0; + j < + 6; + ++j) { + for (int k = 0; + k <= + j; + ++k) { + local_sum + [i++] += + J_G[j] * + w_G * + J_G[k] + + J_D[j] * + w_D * + J_D[k]; + } + local_sum + [21 + + j] += + J_G[j] * + w_G * + r_G + + J_D[j] * + w_D * + r_D; + } + local_sum + [27] += + r_G * r_G + + r_D * r_D; + local_sum[28] += scalar_t( + 1); + } + } + + GroupReduceAndAdd< + kReduceDim, + scalar_t>( + item, + local_sum, + global_sum_ptr); + }); + }) + .wait_and_throw(); + }); + }); + + DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count); +} + +void ComputeInformationMatrixSYCL(const core::Tensor &target_points, + const core::Tensor &correspondence_indices, + core::Tensor &information_matrix, + const core::Dtype &dtype, + const core::Device &device) { + const int n = correspondence_indices.GetLength(); + + core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device); + + auto device_props = + core::sy::SYCLContext::GetInstance().GetDeviceProperties(device); + sycl::queue queue = device_props.queue; + const size_t wgs = device_props.max_work_group_size; + const size_t num_groups = ((size_t)n + wgs - 1) / wgs; + + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t *global_sum_ptr = global_sum.GetDataPtr(); + const scalar_t *target_points_ptr = + target_points.GetDataPtr(); + const int64_t *correspondence_indices_ptr = + correspondence_indices.GetDataPtr(); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + 
sycl::nd_range<1>{num_groups * wgs, wgs}, + [=](sycl::nd_item<1> item) { + const int gid = item.get_global_id(0); + scalar_t local_sum[21] = {}; + + if (gid < n) { + scalar_t J_x[6] = {0}, J_y[6] = {0}, + J_z[6] = {0}; + const bool valid = + GetInformationJacobians( + gid, target_points_ptr, + correspondence_indices_ptr, + J_x, J_y, J_z); + + if (valid) { + int i = 0; + for (int j = 0; j < 6; ++j) { + for (int k = 0; k <= j; ++k) { + local_sum[i++] += J_x[j] * J_x[k] + + J_y[j] * J_y[k] + + J_z[j] * J_z[k]; + } + } + } + } + + GroupReduceAndAdd<21, scalar_t>(item, local_sum, + global_sum_ptr); + }); + }).wait_and_throw(); + + const core::Device host(core::Device("CPU:0")); + core::Tensor global_sum_cpu = global_sum.To(host, core::Float64); + double *sum_ptr = global_sum_cpu.GetDataPtr(); + + // Information matrix is on CPU of type Float64. + double *GTG_ptr = information_matrix.GetDataPtr(); + + int i = 0; + for (int j = 0; j < 6; j++) { + for (int k = 0; k <= j; k++) { + GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i]; + ++i; + } + } + }); +} + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp index c181ade770e..5526e2f1f59 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverter.cpp @@ -63,6 +63,13 @@ static void PoseToTransformationDevice( PoseToTransformationCUDA(transformation_ptr, pose_ptr); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + PoseToTransformationSYCL(transformation_ptr, pose_ptr, + transformation.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); @@ -109,6 +116,13 @@ static 
void TransformationToPoseDevice( TransformationToPoseCUDA(pose_ptr, transformation_ptr); #else utility::LogError("Not compiled with CUDA, but CUDA device is used."); +#endif + } else if (device_type == core::Device::DeviceType::SYCL) { +#ifdef BUILD_SYCL_MODULE + TransformationToPoseSYCL(pose_ptr, transformation_ptr, + pose.GetDevice()); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif } else { utility::LogError("Unimplemented device."); diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h index e75484ce3f5..6ee52ee8ab5 100644 --- a/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterImpl.h @@ -77,6 +77,26 @@ void TransformationToPoseCUDA(scalar_t *pose_ptr, const scalar_t *transformation_ptr); #endif +#ifdef BUILD_SYCL_MODULE +/// \brief Helper function for PoseToTransformationSYCL. +/// Do not call this independently, as it only sets the transformation part +/// in transformation matrix, using the Pose, the rest is set in +/// the parent function PoseToTransformation. +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device); + +/// \brief Helper function for TransformationToPoseSYCL. +/// Do not call this independently, as it only sets the rotation part in the +/// pose, using the Transformation, the rest is set in the parent function +/// TransformationToPose. 
+template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device); +#endif + } // namespace kernel } // namespace pipelines } // namespace t diff --git a/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp new file mode 100644 index 00000000000..13eb4490465 --- /dev/null +++ b/cpp/open3d/t/pipelines/kernel/TransformationConverterSYCL.cpp @@ -0,0 +1,56 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Device.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/t/pipelines/kernel/TransformationConverterImpl.h" + +namespace open3d { +namespace t { +namespace pipelines { +namespace kernel { + +template +void PoseToTransformationSYCL(scalar_t *transformation_ptr, + const scalar_t *pose_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + PoseToTransformationImpl(transformation_ptr, pose_ptr); + }).wait_and_throw(); +} + +template +void TransformationToPoseSYCL(scalar_t *pose_ptr, + const scalar_t *transformation_ptr, + const core::Device &device) { + sycl::queue queue = + core::sy::SYCLContext::GetInstance().GetDefaultQueue(device); + queue.single_task([=]() { + TransformationToPoseImpl(pose_ptr, transformation_ptr); + }).wait_and_throw(); +} + +template void PoseToTransformationSYCL(float *, + const float *, + const core::Device &); +template void PoseToTransformationSYCL(double *, + const double *, + const core::Device &); +template void TransformationToPoseSYCL(float *, + 
const float *, + const core::Device &); +template void TransformationToPoseSYCL(double *, + const double *, + const core::Device &); + +} // namespace kernel +} // namespace pipelines +} // namespace t +} // namespace open3d diff --git a/cpp/pybind/core/nns/nearest_neighbor_search.cpp b/cpp/pybind/core/nns/nearest_neighbor_search.cpp index 3bb647592a0..99394d8681a 100644 --- a/cpp/pybind/core/nns/nearest_neighbor_search.cpp +++ b/cpp/pybind/core/nns/nearest_neighbor_search.cpp @@ -84,7 +84,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for fixed-radius search. Required - for GPU fixed radius index. + for CUDA and SYCL fixed radius indices. Returns: True on success. @@ -115,7 +115,7 @@ This function needs to be called once before performing search operations. Args: radius (float, optional): Radius value for hybrid search. Required - for GPU hybrid index. + for CUDA and SYCL hybrid indices. Returns: True on success. @@ -130,7 +130,8 @@ This function needs to be called once before performing search operations. To use knn_search initialize the index using knn_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. knn (int): Number of neighbors to search per query point. Example: @@ -179,6 +180,7 @@ This function needs to be called once before performing search operations. Args: query_points (open3d.core.Tensor): Query points with shape {n, d}. + CPU, CUDA, and SYCL devices are supported. radius (float): Radius value for fixed-radius search. Note that this parameter can differ from the radius used to initialize the index for convenience, which may cause the index to be rebuilt for GPU @@ -234,7 +236,8 @@ This function needs to be called once before performing search operations. 
To use multi_radius_search initialize the index using multi_radius_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. Only + CPU tensors are supported. radii (open3d.core.Tensor): Radii of query points. Each query point has one radius. Returns: @@ -292,7 +295,8 @@ Hybrid search behaves similarly to fixed-radius search, but with a maximum numbe To use hybrid_search initialize the index using hybrid_index before calling this function. Args: - query_points (open3d.core.Tensor): Query points with shape {n, d}. + query_points (open3d.core.Tensor): Query points with shape {n, d}. CPU, + CUDA, and SYCL devices are supported. radius (float): Radius value for hybrid search. max_knn (int): Maximum number of neighbor to search per query. diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index a1c4826d0fa..1d71b53455c 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -22,10 +22,15 @@ target_sources(tests PRIVATE SYCLUtils.cpp ) +if (BUILD_CUDA_MODULE OR BUILD_SYCL_MODULE) + target_sources(tests PRIVATE + KnnIndex.cpp + ) +endif() + if (BUILD_CUDA_MODULE) target_sources(tests PRIVATE FixedRadiusIndex.cpp - KnnIndex.cpp ParallelFor.cu ) endif() @@ -38,10 +43,8 @@ if (BUILD_ISPC_MODULE) endif() # TODO: cmake does not currently build this test! 
-# if (BUILD_SYCL_MODULE) -# target_sources(tests PRIVATE -# ParallelForSYCL.cpp -# ) -# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES -# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") -# endif() \ No newline at end of file +#if (BUILD_SYCL_MODULE) +# open3d_sycl_target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +#endif() \ No newline at end of file diff --git a/cpp/tests/core/KnnIndex.cpp b/cpp/tests/core/KnnIndex.cpp index 1ceb32d4835..a5d63beebd1 100644 --- a/cpp/tests/core/KnnIndex.cpp +++ b/cpp/tests/core/KnnIndex.cpp @@ -24,9 +24,18 @@ using namespace std; namespace open3d { namespace tests { -TEST(KnnIndex, KnnSearch) { +class KnnIndexPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + KnnIndex, + KnnIndexPermuteDevices, + testing::ValuesIn(KnnIndexPermuteDevices::TestCases())); + +TEST_P(KnnIndexPermuteDevices, KnnSearch) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -149,9 +158,12 @@ TEST(KnnIndex, KnnSearch) { EXPECT_TRUE(distances.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchHighdim) { +TEST_P(KnnIndexPermuteDevices, KnnSearchHighdim) { // Define test data. - core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.1}, {0.0, 0.0, 0.2}, @@ -231,9 +243,12 @@ TEST(KnnIndex, KnnSearchHighdim) { EXPECT_TRUE(distances64.AllClose(gt_distances)); } -TEST(KnnIndex, KnnSearchBatch) { +TEST_P(KnnIndexPermuteDevices, KnnSearchBatch) { // Define test data. 
- core::Device device = core::Device("CUDA:0"); + core::Device device = GetParam(); + if (device.IsCPU()) { + GTEST_SKIP() << "KnnIndex does not support CPU device."; + } core::Tensor dataset_points = core::Tensor::Init( {{0.719, 0.128, 0.431}, {0.764, 0.970, 0.678}, {0.692, 0.786, 0.211}, {0.692, 0.969, 0.942}, @@ -303,8 +318,9 @@ TEST(KnnIndex, KnnSearchBatch) { index32.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); // int64 // Set up Knn index. @@ -327,8 +343,9 @@ TEST(KnnIndex, KnnSearchBatch) { index64.SearchKnn(query_points, queries_row_splits, 3); EXPECT_EQ(indices.GetShape(), shape); EXPECT_EQ(distances.GetShape(), shape); - EXPECT_TRUE(indices.AllClose(gt_indices)); - EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)); + EXPECT_TRUE(indices.AllClose(gt_indices)) << indices.ToString(); + EXPECT_TRUE(distances.AllClose(gt_distances, 1e-5, 1e-3)) + << distances.ToString(); } } // namespace tests diff --git a/cpp/tests/core/NearestNeighborSearch.cpp b/cpp/tests/core/NearestNeighborSearch.cpp index d282e8ecd9a..547d3829a06 100644 --- a/cpp/tests/core/NearestNeighborSearch.cpp +++ b/cpp/tests/core/NearestNeighborSearch.cpp @@ -22,7 +22,7 @@ namespace open3d { namespace tests { -class NNSPermuteDevices : public PermuteDevices {}; +class NNSPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(NearestNeighborSearch, NNSPermuteDevices, testing::ValuesIn(PermuteDevices::TestCases())); diff --git a/cpp/tests/t/pipelines/TransformationConverter.cpp b/cpp/tests/t/pipelines/TransformationConverter.cpp index 5d8bd0d8434..a4c092c62d6 100644 --- a/cpp/tests/t/pipelines/TransformationConverter.cpp +++ 
b/cpp/tests/t/pipelines/TransformationConverter.cpp @@ -14,10 +14,11 @@ namespace open3d { namespace tests { -class TransformationConverterPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TransformationConverter, - TransformationConverterPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TransformationConverterPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TransformationConverter, + TransformationConverterPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TransformationConverterPermuteDevices, RtToTransformation) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp index 3bb7ee36142..64d2a97d4a7 100644 --- a/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp +++ b/cpp/tests/t/pipelines/odometry/RGBDOdometry.cpp @@ -21,10 +21,11 @@ namespace open3d { namespace tests { -class OdometryPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Odometry, - OdometryPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class OdometryPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Odometry, + OdometryPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); core::Tensor CreateIntrisicTensor() { camera::PinholeCameraIntrinsic intrinsic = camera::PinholeCameraIntrinsic( diff --git a/cpp/tests/t/pipelines/registration/Feature.cpp b/cpp/tests/t/pipelines/registration/Feature.cpp index a3741a3ed8a..7cb5f27ea8c 100644 --- a/cpp/tests/t/pipelines/registration/Feature.cpp +++ b/cpp/tests/t/pipelines/registration/Feature.cpp @@ -20,10 +20,11 @@ namespace open3d { namespace tests { -class FeaturePermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Feature, - FeaturePermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class FeaturePermuteDevices : public PermuteDevicesWithSYCL {}; 
+INSTANTIATE_TEST_SUITE_P( + Feature, + FeaturePermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(FeaturePermuteDevices, SelectByIndex) { core::Device device = GetParam(); diff --git a/cpp/tests/t/pipelines/registration/Registration.cpp b/cpp/tests/t/pipelines/registration/Registration.cpp index d823394c6bf..d3d0544e27c 100644 --- a/cpp/tests/t/pipelines/registration/Registration.cpp +++ b/cpp/tests/t/pipelines/registration/Registration.cpp @@ -26,10 +26,11 @@ namespace l_reg = open3d::pipelines::registration; namespace open3d { namespace tests { -class RegistrationPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Registration, - RegistrationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class RegistrationPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Registration, + RegistrationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(RegistrationPermuteDevices, ICPConvergenceCriteriaConstructor) { // Constructor. 
diff --git a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp index 2daca846d0a..afc7f40e7a8 100644 --- a/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp +++ b/cpp/tests/t/pipelines/registration/TransformationEstimation.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TransformationEstimationPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TransformationEstimation, - TransformationEstimationPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TransformationEstimationPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TransformationEstimation, + TransformationEstimationPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); static std:: tuple diff --git a/cpp/tests/t/pipelines/slac/ControlGrid.cpp b/cpp/tests/t/pipelines/slac/ControlGrid.cpp index 98bce9d5ef2..7f716f553ee 100644 --- a/cpp/tests/t/pipelines/slac/ControlGrid.cpp +++ b/cpp/tests/t/pipelines/slac/ControlGrid.cpp @@ -24,10 +24,11 @@ static t::geometry::PointCloud CreateTPCDFromFile( return t::geometry::PointCloud::FromLegacy(*pcd, core::Float32, device); } -class ControlGridPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(ControlGrid, - ControlGridPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class ControlGridPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + ControlGrid, + ControlGridPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // TODO(wei): more well-designed test cases TEST_P(ControlGridPermuteDevices, Touch) { diff --git a/cpp/tests/t/pipelines/slac/SLAC.cpp b/cpp/tests/t/pipelines/slac/SLAC.cpp index bc31cb31044..00926a38741 100644 --- a/cpp/tests/t/pipelines/slac/SLAC.cpp +++ b/cpp/tests/t/pipelines/slac/SLAC.cpp @@ -27,10 +27,11 @@ namespace open3d { namespace tests { -class SLACPermuteDevices : 
public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(SLAC, - SLACPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class SLACPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + SLAC, + SLACPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); // PointCloud is similar if fitness is higher and rmse is lower than tolerance // threshold. diff --git a/python/test/core/test_nn.py b/python/test/core/test_nn.py index 48b7d2682cd..b5376fec105 100644 --- a/python/test/core/test_nn.py +++ b/python/test/core/test_nn.py @@ -19,7 +19,7 @@ np.random.seed(0) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_index(device): dtype = o3c.float32 @@ -34,7 +34,7 @@ def test_knn_index(device): assert nns.multi_radius_index() -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_knn_search(device): dtype = o3c.float32 @@ -107,7 +107,7 @@ def test_knn_search(device): rtol=1e-5) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_fixed_radius_search(device, dtype): dataset_points = o3c.Tensor(