Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions lib/kernels/include/kernels/accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,12 @@ std::vector<real_type_t<DT> const *>
return out;
}

bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a,
GenericTensorAccessorR const &accessor_b);

bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
GenericTensorAccessorR const &accessor_b);

GenericTensorAccessorR read_only_accessor_from_write_accessor(
GenericTensorAccessorW const &write_accessor);

Expand All @@ -280,9 +286,6 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorR const &accessor);

void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor);

} // namespace FlexFlow

namespace FlexFlow {
Expand Down
11 changes: 11 additions & 0 deletions lib/kernels/include/kernels/copy_tensor_accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

namespace FlexFlow {

void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor);

GenericTensorAccessorR
copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor,
Allocator &allocator);
Expand All @@ -14,6 +17,14 @@ GenericTensorAccessorW
copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor,
Allocator &allocator);

GenericTensorAccessorW
copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor,
Allocator &allocator);

GenericTensorAccessorR
copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor,
Allocator &allocator);

} // namespace FlexFlow

#endif
73 changes: 40 additions & 33 deletions lib/kernels/src/accessor.cc
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
#include "kernels/accessor.h"
#include "kernels/allocation.h"
#include "kernels/copy_tensor_accessor.h"
#include "kernels/datatype_dispatch.h"
#include "kernels/local_cpu_allocator.h"
#include <cstring>
#include <iostream>

namespace FlexFlow {

void copy_accessor_data_to_l_from_r(
GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor) {
size_t num_bytes =
dst_accessor.shape.get_volume().unwrap_nonnegative() *
size_of_datatype(dst_accessor.data_type).unwrap_nonnegative();

DeviceType dst_device_type = dst_accessor.device_type;
DeviceType src_device_type = src_accessor.device_type;

if (src_device_type == DeviceType::CPU &&
dst_device_type == DeviceType::CPU) {
memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes);
} else if (src_device_type == DeviceType::CPU &&
dst_device_type == DeviceType::GPU) {
checkCUDA(cudaMemcpy(
dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice));
} else if (src_device_type == DeviceType::GPU &&
dst_device_type == DeviceType::CPU) {
checkCUDA(cudaMemcpy(
dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost));
} else {
assert(src_device_type == DeviceType::GPU);
assert(dst_device_type == DeviceType::GPU);
checkCUDA(cudaMemcpy(dst_accessor.ptr,
src_accessor.ptr,
num_bytes,
cudaMemcpyDeviceToDevice));
template <DataType DT>
struct AccessorDataIsEqual {
  // Element-wise comparison of two read-only accessors, dispatched on DT.
  // Both tensors are staged to host memory first, so GPU-resident data can
  // be compared too. NOTE(review): elements are compared with operator==,
  // so NaN values never compare equal — confirm this is intended for
  // floating-point tensors.
  bool operator()(GenericTensorAccessorR const &a,
                  GenericTensorAccessorR const &b) {
    int const element_count = a.shape.num_elements().unwrap_nonnegative();
    if (b.shape.num_elements().unwrap_nonnegative() != element_count) {
      return false;
    }

    // Stage both tensors on the CPU (no-op when already host-resident).
    Allocator host_allocator = create_local_cpu_memory_allocator();
    GenericTensorAccessorR host_a =
        copy_accessor_r_to_cpu_if_necessary(a, host_allocator);
    GenericTensorAccessorR host_b =
        copy_accessor_r_to_cpu_if_necessary(b, host_allocator);

    using T = real_type_t<DT>;
    T const *lhs = host_a.get<DT>();
    T const *rhs = host_b.get<DT>();
    for (int i = 0; i < element_count; i++) {
      if (!(lhs[i] == rhs[i])) {
        return false;
      }
    }
    return true;
  }
};

/// Returns true iff the two accessors hold element-wise identical data.
///
/// Guards against dispatching on accessor_a's dtype while reinterpreting
/// accessor_b's buffer as a different element type: accessors with
/// mismatched dtypes are never considered data-equal.
bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a,
                            GenericTensorAccessorR const &accessor_b) {
  if (accessor_a.data_type != accessor_b.data_type) {
    return false;
  }
  return DataTypeDispatch1<AccessorDataIsEqual>{}(
      accessor_a.data_type, accessor_a, accessor_b);
}

/// Full equality for accessors: dtype, device, and shape must match, and
/// the element data must compare equal. The data comparison is the most
/// expensive step (it may copy device tensors to host), so it runs last.
bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
                         GenericTensorAccessorR const &accessor_b) {
  if (accessor_a.data_type != accessor_b.data_type) {
    return false;
  }
  if (accessor_a.device_type != accessor_b.device_type) {
    return false;
  }
  if (!(accessor_a.shape == accessor_b.shape)) {
    return false;
  }
  return accessor_data_is_equal(accessor_a, accessor_b);
}

GenericTensorAccessorW::operator GenericTensorAccessorR() const {
Expand All @@ -56,12 +63,12 @@ std::tuple<DataType const &,

// Deep equality: compares metadata and element data via accessors_are_equal
// (uses the implicit conversion to GenericTensorAccessorR; may copy
// device-resident tensors to host).
bool GenericTensorAccessorW::operator==(
    GenericTensorAccessorW const &other) const {
  return accessors_are_equal(*this, other);
}

// Logical negation of deep equality (see operator==).
bool GenericTensorAccessorW::operator!=(
    GenericTensorAccessorW const &other) const {
  return !accessors_are_equal(*this, other);
}

int32_t *GenericTensorAccessorW::get_int32_ptr() const {
Expand Down Expand Up @@ -112,12 +119,12 @@ std::tuple<DataType const &,

// Deep equality: compares metadata and element data via accessors_are_equal
// (may copy device-resident tensors to host).
bool GenericTensorAccessorR::operator==(
    GenericTensorAccessorR const &other) const {
  return accessors_are_equal(*this, other);
}

// Logical negation of deep equality (see operator==).
bool GenericTensorAccessorR::operator!=(
    GenericTensorAccessorR const &other) const {
  return !accessors_are_equal(*this, other);
}

int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
Expand Down
38 changes: 33 additions & 5 deletions lib/kernels/src/array_shape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,40 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const {
return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
}

// Returns the legion-ordered index of the final dimension.
// Throws when the shape has no dimensions at all.
legion_dim_t ArrayShape::last_idx() const {
  auto const num_dims = this->dims.size();
  if (num_dims == 0) {
    throw mk_runtime_error("Cannot get last index of an empty shape");
  }
  return legion_dim_t(nonnegative_int{num_dims - 1});
}

// Converts a negative (python-style) index into a legion_dim_t,
// e.g. idx == -1 refers to the last dimension.
// Throws for non-negative idx and for magnitudes exceeding the rank.
legion_dim_t ArrayShape::neg_idx(int idx) const {
  // Check the sign first so a non-negative idx gets the correct diagnostic;
  // previously an out-of-range positive idx was misreported as an
  // "Invalid negative index".
  if (idx >= 0) {
    throw mk_runtime_error(fmt::format(
        "Idx should be negative for negative indexing, got {}", idx));
  }

  if (std::abs(idx) > this->dims.size()) {
    throw mk_runtime_error(
        fmt::format("Invalid negative index: {} (shape has {} dimensions)",
                    idx,
                    this->dims.size()));
  }

  return legion_dim_t(nonnegative_int{this->dims.size() + idx});
}

// Two shapes are equal iff their dimension lists are identical.
bool ArrayShape::operator==(ArrayShape const &other) const {
  return this->dims == other.dims;
}

// Logical negation of operator== (dimension-list equality).
bool ArrayShape::operator!=(ArrayShape const &other) const {
  return !(this->dims == other.dims);
}

ArrayShape ArrayShape::sub_shape(
std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {

nonnegative_int num_dims = this->num_dims();

auto to_legion_index = [num_dims](auto arg) -> nonnegative_int {
Expand All @@ -85,7 +107,9 @@ ArrayShape ArrayShape::sub_shape(
"Invalid sub_shape range: start={}, end={}", start_idx, end_idx));
}

return ArrayShape(&this->dims[legion_dim_t{start_idx}], end_idx - start_idx);
return ArrayShape(std::vector<nonnegative_int>(
this->dims.begin() + start_idx.unwrap_nonnegative(),
this->dims.begin() + end_idx.unwrap_nonnegative()));
}

std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
Expand All @@ -97,7 +121,11 @@ std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
}

// Bounds-checked lookup by ff-ordered dimension index: returns
// std::nullopt when `index` is out of range instead of forwarding an
// invalid index to legion_dim_from_ff_dim.
std::optional<nonnegative_int> ArrayShape::at_maybe(ff_dim_t index) const {
  if (index.value < this->num_dims()) {
    return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims()));
  } else {
    return std::nullopt;
  }
}

std::tuple<LegionOrdered<nonnegative_int> const &> ArrayShape::tie() const {
Expand Down
59 changes: 59 additions & 0 deletions lib/kernels/src/copy_tensor_accessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,37 @@

namespace FlexFlow {

// Copies the raw element data of src_accessor into dst_accessor, picking
// the memcpy / cudaMemcpy direction from the two accessors' device types.
// Throws when the two buffers' total byte sizes disagree, which would
// otherwise cause an out-of-bounds read or write.
void copy_accessor_data_to_l_from_r(
    GenericTensorAccessorW &dst_accessor,
    GenericTensorAccessorR const &src_accessor) {
  size_t dst_bytes =
      dst_accessor.shape.get_volume().unwrap_nonnegative() *
      size_of_datatype(dst_accessor.data_type).unwrap_nonnegative();
  size_t src_bytes =
      src_accessor.shape.get_volume().unwrap_nonnegative() *
      size_of_datatype(src_accessor.data_type).unwrap_nonnegative();

  // Previously only the destination size was used, so a smaller source
  // buffer would have been read past its end.
  if (dst_bytes != src_bytes) {
    throw mk_runtime_error(
        "copy_accessor_data_to_l_from_r: src and dst byte sizes differ");
  }
  size_t num_bytes = dst_bytes;

  DeviceType dst_device_type = dst_accessor.device_type;
  DeviceType src_device_type = src_accessor.device_type;

  if (src_device_type == DeviceType::CPU &&
      dst_device_type == DeviceType::CPU) {
    memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes);
  } else if (src_device_type == DeviceType::CPU &&
             dst_device_type == DeviceType::GPU) {
    checkCUDA(cudaMemcpy(
        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice));
  } else if (src_device_type == DeviceType::GPU &&
             dst_device_type == DeviceType::CPU) {
    checkCUDA(cudaMemcpy(
        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost));
  } else {
    assert(src_device_type == DeviceType::GPU);
    assert(dst_device_type == DeviceType::GPU);
    checkCUDA(cudaMemcpy(dst_accessor.ptr,
                         src_accessor.ptr,
                         num_bytes,
                         cudaMemcpyDeviceToDevice));
  }
}

template <DataType DT>
struct CopyTensorAccessorW {
GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor,
Expand Down Expand Up @@ -45,4 +76,32 @@ GenericTensorAccessorR
src_accessor.data_type, src_accessor, allocator);
}

// Returns `accessor` unchanged when it already lives on the CPU; otherwise
// returns a CPU copy allocated from `cpu_allocator`.
// NOTE(review): a returned copy presumably stays valid only while
// `cpu_allocator` lives — confirm Allocator ownership semantics.
GenericTensorAccessorR
    copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor,
                                        Allocator &cpu_allocator) {
  if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) {
    throw mk_runtime_error("Allocator must be a CPU allocator");
  }

  if (accessor.device_type != DeviceType::GPU) {
    return accessor;
  }
  return copy_tensor_accessor_r(accessor, cpu_allocator);
}

// Returns `accessor` unchanged when it already lives on the CPU; otherwise
// returns a CPU copy allocated from `cpu_allocator`.
// NOTE(review): a returned copy presumably stays valid only while
// `cpu_allocator` lives — confirm Allocator ownership semantics.
GenericTensorAccessorW
    copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor,
                                        Allocator &cpu_allocator) {
  if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) {
    throw mk_runtime_error("Allocator must be a CPU allocator");
  }

  if (accessor.device_type != DeviceType::GPU) {
    return accessor;
  }
  return copy_tensor_accessor_w(accessor, cpu_allocator);
}

} // namespace FlexFlow
1 change: 0 additions & 1 deletion lib/kernels/src/legion_dim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) {
// Maps an ff-ordered dimension index to its legion-ordered counterpart:
// legion dims index the same dimension list from the opposite end
// (legion = num_dimensions - ff - 1). Removed a stray empty statement.
legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
                                    nonnegative_int num_dimensions) {
  return legion_dim_t{num_dimensions - ff_dim.value - 1_n};
}

} // namespace FlexFlow
5 changes: 3 additions & 2 deletions lib/kernels/src/perf_metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ PerfMetrics::PerfMetrics(int _train_all,
double _start_time_micro,
double _current_time_micro)
: train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss),
mse_loss(_mse_loss), rmse_loss(_rmse_loss), mae_loss(_mae_loss),
start_time(_start_time_micro), current_time(_current_time_micro) {}
sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss),
rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro),
current_time(_current_time_micro) {}

float get_throughput(PerfMetrics const &m) {
return m.train_all / (m.current_time - m.start_time);
Expand Down
Loading
Loading