From 74d41d0a0f7fafeb7ff3bcdfdc1604656494ad25 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 11:31:04 -0800
Subject: [PATCH 001/113] Add make_dynamic_open_dataflow_graph_from_pcg.

---
 .../parallel_computation_graph.h              |  6 ++
 .../parallel_computation_graph.cc             | 21 +++++
 ...ake_dynamic_open_dataflow_graph_from_pcg.h | 14 ++++
 ...ke_dynamic_open_dataflow_graph_from_pcg.cc | 77 +++++++++++++++++++
 4 files changed, 118 insertions(+)
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h
 create mode 100644 lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 25dc0721cd..3d948ac107 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -54,6 +54,9 @@ std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
 std::unordered_set<parallel_layer_guid_t>
     get_initial_layers(ParallelComputationGraph const &);
 
+std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
+    get_outgoing_tensors(ParallelComputationGraph const &,
+                         parallel_layer_guid_t const &);
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &,
                          parallel_layer_guid_t const &);
@@ -107,6 +110,9 @@ ParallelTensorShape get_parallel_tensor_shape(ParallelComputationGraph const &,
 std::vector<parallel_layer_guid_t>
     topological_ordering(ParallelComputationGraph const &);
 
+std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs>
+    get_parallel_layer_attrs_mapping(ParallelComputationGraph const &pcg);
+
 parallel_layer_guid_t
     get_parallel_layer_by_name(ParallelComputationGraph const &pcg,
                                std::string const &name);
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index f83628b8e1..907dc05620 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -212,6 +212,16 @@ std::unordered_set<parallel_layer_guid_t>
                    [](Node const &n) { return parallel_layer_guid_t{n}; });
 }
 
+std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
+    get_outgoing_tensors(ParallelComputationGraph const &pcg,
+                         parallel_layer_guid_t const &l) {
+  return map_values(get_outgoing_kwarg_dataflow_outputs_for_node(
+                        pcg.raw_graph, l.raw_graph_node),
+                    [](KwargDataflowOutput<TensorSlotName> const &o) {
+                      return parallel_tensor_guid_t{o};
+                    });
+}
+
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &pcg,
                          parallel_layer_guid_t const &l) {
@@ -378,6 +388,17 @@ std::vector<parallel_layer_guid_t>
                    [](Node const &n) { return parallel_layer_guid_t{n}; });
 }
 
+std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs>
+    get_parallel_layer_attrs_mapping(ParallelComputationGraph const &pcg) {
+  std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs>
+      layer_attrs_mapping;
+  for (parallel_layer_guid_t const &layer_guid : get_parallel_layers(pcg)) {
+    layer_attrs_mapping.insert(
+        {layer_guid, get_parallel_layer_attrs(pcg, layer_guid)});
+  }
+  return layer_attrs_mapping;
+}
+
 parallel_layer_guid_t
     get_parallel_layer_by_name(ParallelComputationGraph const &pcg,
                                std::string const &name) {
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h
new file mode 100644
index 0000000000..a71eb558c1
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_PCG_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_PCG_H
+
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+
+namespace FlexFlow {
+
+DynamicOpenDataflowGraph
+    make_dynamic_open_dataflow_graph_from_pcg(ParallelComputationGraph const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc
new file mode 100644
index 0000000000..841be27dfd
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc
@@ -0,0 +1,77 @@
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/pcg_operator_attrs.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_role.h"
+#include "utils/containers/generate_map.h"
+#include <optional>
+#include <unordered_map>
+#include <utility>
+
+namespace FlexFlow {
+
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_pcg(
+    ParallelComputationGraph const &pcg) {
+  DynamicOpenDataflowGraph result = make_empty_dynamic_open_dataflow_graph();
+
+  for (auto const &[layer, attrs] : get_parallel_layer_attrs_mapping(pcg)) {
+    DynamicNodeAttrs result_attrs{
+        /*task_type=*/std::nullopt,
+        /*device_coord=*/std::nullopt,
+        /*mapping=*/std::nullopt,
+        /*op_attrs=*/attrs.op_attrs,
+        /*pcg_layer_guid=*/dynamic_layer_guid_t{layer},
+        /*per_device_op_state=*/std::nullopt,
+    };
+
+    std::unordered_map<DynamicTensorSlot, DynamicValueAttrs> result_inputs =
+        transform(get_incoming_tensors(pcg, layer),
+                  [&](TensorSlotName const &slot_name,
+                      parallel_tensor_guid_t const &tensor) {
+                    ParallelTensorAttrs attrs =
+                        get_parallel_tensor_attrs(pcg, tensor);
+                    return std::pair<DynamicTensorSlot, DynamicValueAttrs>{
+                        DynamicTensorSlot{
+                            /*slot_name=*/slot_name,
+                            /*slot_tensor_role=*/std::nullopt,
+                        },
+                        DynamicValueAttrs{
+                            /*tensor_guid=*/dynamic_tensor_guid_t{tensor},
+                            /*parallel_tensor_shape=*/attrs.shape,
+                            /*shard_coord=*/std::nullopt,
+                            /*accessor=*/std::nullopt,
+                            /*role=*/std::nullopt,
+                        },
+                    };
+                  });
+    std::unordered_map<DynamicTensorSlot, DynamicValueAttrs> result_outputs =
+        transform(get_outgoing_tensors(pcg, layer),
+                  [&](TensorSlotName const &slot_name,
+                      parallel_tensor_guid_t const &tensor) {
+                    ParallelTensorAttrs attrs =
+                        get_parallel_tensor_attrs(pcg, tensor);
+                    return std::pair<DynamicTensorSlot, DynamicValueAttrs>{
+                        DynamicTensorSlot{
+                            /*slot_name=*/slot_name,
+                            /*slot_tensor_role=*/std::nullopt,
+                        },
+                        DynamicValueAttrs{
+                            /*tensor_guid=*/dynamic_tensor_guid_t{tensor},
+                            /*parallel_tensor_shape=*/attrs.shape,
+                            /*shard_coord=*/std::nullopt,
+                            /*accessor=*/std::nullopt,
+                            /*role=*/std::nullopt,
+                        },
+                    };
+                  });
+
+    result.invocations.emplace(result_inputs, result_attrs, result_outputs);
+  }
+
+  return result;
+}
+
+} // namespace FlexFlow

From cbe2b4a5d6116635447aa54cc4a2cf5bbe873a15 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 14:10:13 -0800
Subject: [PATCH 002/113] Empty skeleton of the realm-execution backend.

---
 .proj.toml                                    |  7 +++++++
 lib/CMakeLists.txt                            |  1 +
 lib/realm-execution/CMakeLists.txt            | 21 +++++++++++++++++++
 .../parallel_computation_graph_instance.h     | 12 +++++++++++
 .../parallel_computation_graph_instance.cc    |  1 +
 lib/realm-execution/test/CMakeLists.txt       | 15 +++++++++++++
 .../test/src/realm-execution/test_e2e.cc      |  9 ++++++++
 7 files changed, 66 insertions(+)
 create mode 100644 lib/realm-execution/CMakeLists.txt
 create mode 100644 lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
 create mode 100644 lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
 create mode 100644 lib/realm-execution/test/CMakeLists.txt
 create mode 100644 lib/realm-execution/test/src/realm-execution/test_e2e.cc

diff --git a/.proj.toml b/.proj.toml
index 38690f710b..5dbbfbcdd7 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -85,6 +85,13 @@ has-cpu-only-benchmarks = false
 has-cuda-tests = true
 has-cuda-benchmarks = false
 
+[targets.realm-execution]
+type = "lib"
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = true
+has-cuda-benchmarks = false
+
 # [targets.local-pcg-execution]
 # type = "lib"
 # has-cpu-only-tests = true
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 2e71e577c0..cb3bd6d6ae 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -5,6 +5,7 @@ add_subdirectory(op-attrs)
 add_subdirectory(kernels)
 add_subdirectory(local-execution)
 add_subdirectory(local-pcg-execution)
+add_subdirectory(realm-execution)
 add_subdirectory(task-spec)
 add_subdirectory(utils)
 add_subdirectory(ffi)
diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt
new file mode 100644
index 0000000000..7a38f70607
--- /dev/null
+++ b/lib/realm-execution/CMakeLists.txt
@@ -0,0 +1,21 @@
+ff_add_library(
+  NAME
+    realm-execution
+  SRC_PATTERNS
+    src/*.cc
+  PUBLIC_INCLUDE
+    include/
+  PRIVATE_INCLUDE
+    src/
+  DEPS
+    op-attrs
+    utils
+    kernels
+    task-spec
+    pcg
+    spdlog
+    compiler
+    local-execution
+)
+
+add_subdirectory(test)
diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
new file mode 100644
index 0000000000..58cc5234d9
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -0,0 +1,12 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PARALLEL_COMPUTATION_GRAPH_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PARALLEL_COMPUTATION_GRAPH_INSTANCE_H
+
+namespace FlexFlow {
+
+struct ParallelComputationGraphInstance {
+  public:
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
new file mode 100644
index 0000000000..a22f4730b7
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -0,0 +1 @@
+#include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
diff --git a/lib/realm-execution/test/CMakeLists.txt b/lib/realm-execution/test/CMakeLists.txt
new file mode 100644
index 0000000000..b3beff42c0
--- /dev/null
+++ b/lib/realm-execution/test/CMakeLists.txt
@@ -0,0 +1,15 @@
+ff_add_test_executable(
+  NAME
+    realm-execution-tests
+  SRC_PATTERNS
+    src/*.cc
+  PRIVATE_INCLUDE
+    src/
+  DEPS
+    doctest
+    utils-test-common
+    realm-execution
+    kernels
+    op-attrs
+    task-spec
+)
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
new file mode 100644
index 0000000000..55dfe427d5
--- /dev/null
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -0,0 +1,9 @@
+#include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("RealmBackend e2e Training") {
+  }
+}

From 5cb317353cb753b62f5e17898fc4687a3127c508 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 14:25:52 -0800
Subject: [PATCH 003/113] Add ParallelComputationGraphInstance members, accessors, and factory declaration.

---
 .../parallel_computation_graph_instance.h     | 52 ++++++++++++++++++-
 .../parallel_computation_graph_instance.cc    | 45 ++++++++++++++++
 .../test/src/realm-execution/test_e2e.cc      |  3 +-
 3 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index 58cc5234d9..b0529761c1 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -1,12 +1,62 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PARALLEL_COMPUTATION_GRAPH_INSTANCE_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PARALLEL_COMPUTATION_GRAPH_INSTANCE_H
 
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/profiling_settings.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+#include "utils/units/milliseconds_t.h"
+#include <optional>
+
 namespace FlexFlow {
 
 struct ParallelComputationGraphInstance {
-  public:
+public:
+  ParallelComputationGraphInstance(DynamicOpenDataflowGraph,
+                                   Allocator &,
+                                   std::vector<DynamicNodeInvocation> const &,
+                                   OptimizerAttrs const &,
+                                   std::optional<LossAttrs> const &,
+                                   std::optional<GenericTensorAccessorW>);
+  DynamicOpenDataflowGraph const &get_dynamic_dataflow_graph() const;
+  Allocator &get_allocator() const;
+  std::vector<DynamicNodeInvocation> const &get_topological_ordering() const;
+  OptimizerAttrs const &get_optimizer_attrs() const;
+  void update_optimizer_attrs_for_next_iter();
+  std::optional<LossAttrs> const &get_loss_attrs() const;
+  std::optional<GenericTensorAccessorR> get_loss_tensor_accessor() const;
+
+private:
+  DynamicOpenDataflowGraph dataflow_graph;
+  Allocator &allocator;
+  std::vector<DynamicNodeInvocation> topological_ordering;
+  OptimizerAttrs optimizer_attrs;
+  std::optional<LossAttrs> loss_attrs;
+  std::optional<GenericTensorAccessorW> logit_grad_tensor;
 };
 
+ParallelComputationGraphInstance create_parallel_computation_graph_instance(
+    ParallelComputationGraph const &pcg,
+    OptimizerAttrs const &optimizer_attrs,
+    std::optional<LossAttrs> const &loss_attrs,
+    std::optional<GenericTensorAccessorR> label_tensor,
+    std::optional<dynamic_tensor_guid_t> logit_tensor,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &input_tensors,
+    Allocator &allocator,
+    ProfilingSettings const &profiling_settings,
+    device_handle_t const &device_handle,
+    FFIterationConfig const &iteration_config,
+    device_id_t device_idx);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index a22f4730b7..2f001a2975 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -1 +1,46 @@
 #include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+ParallelComputationGraphInstance::ParallelComputationGraphInstance(
+    DynamicOpenDataflowGraph dataflow_graph,
+    Allocator &allocator,
+    std::vector<DynamicNodeInvocation> const &topological_ordering,
+    OptimizerAttrs const &optimizer_attrs,
+    std::optional<LossAttrs> const &loss_attrs,
+    std::optional<GenericTensorAccessorW> logit_grad_tensor)
+    : dataflow_graph(dataflow_graph), allocator(allocator),
+      topological_ordering(topological_ordering),
+      optimizer_attrs(optimizer_attrs), loss_attrs(loss_attrs),
+      logit_grad_tensor(logit_grad_tensor) {}
+
+DynamicOpenDataflowGraph const &
+    ParallelComputationGraphInstance::get_dynamic_dataflow_graph() const {
+  return this->dataflow_graph;
+}
+Allocator &ParallelComputationGraphInstance::get_allocator() const {
+  return this->allocator;
+}
+std::vector<DynamicNodeInvocation> const &
+    ParallelComputationGraphInstance::get_topological_ordering() const {
+  return this->topological_ordering;
+}
+OptimizerAttrs const &
+    ParallelComputationGraphInstance::get_optimizer_attrs() const {
+  return this->optimizer_attrs;
+}
+void ParallelComputationGraphInstance::update_optimizer_attrs_for_next_iter() {
+  this->optimizer_attrs =
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
+}
+std::optional<LossAttrs> const &
+    ParallelComputationGraphInstance::get_loss_attrs() const {
+  return this->loss_attrs;
+}
+std::optional<GenericTensorAccessorR>
+    ParallelComputationGraphInstance::get_loss_tensor_accessor() const {
+  return this->logit_grad_tensor;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 55dfe427d5..78a57fb99f 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -4,6 +4,5 @@
 using namespace ::FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("RealmBackend e2e Training") {
-  }
+  TEST_CASE("RealmBackend e2e Training") {}
 }

From 7280bcaf0f35610ce4a6cc4e4b9abf998dec881b Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 14:59:55 -0800
Subject: [PATCH 004/113] Stub out create_parallel_computation_graph_instance.

---
 .../parallel_computation_graph_instance.cc      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 2f001a2975..29683c4dba 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -1,5 +1,6 @@
 #include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
 #include "pcg/optimizer_attrs.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -43,4 +44,20 @@ std::optional<GenericTensorAccessorR>
   return this->logit_grad_tensor;
 }
 
+ParallelComputationGraphInstance create_parallel_computation_graph_instance(
+    ParallelComputationGraph const &pcg,
+    OptimizerAttrs const &optimizer_attrs,
+    std::optional<LossAttrs> const &loss_attrs,
+    std::optional<GenericTensorAccessorR> label_tensor,
+    std::optional<dynamic_tensor_guid_t> logit_tensor,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &input_tensors,
+    Allocator &allocator,
+    ProfilingSettings const &profiling_settings,
+    device_handle_t const &device_handle,
+    FFIterationConfig const &iteration_config,
+    device_id_t device_idx) {
+  NOT_IMPLEMENTED();
+}
+
 } // namespace FlexFlow

From aeb88fcfd42b99e914b1a2c62c4698ffab376785 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 15:17:00 -0800
Subject: [PATCH 005/113] Run dynamic graph passes in create_parallel_computation_graph_instance.

---
 .../parallel_computation_graph_instance.cc    | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 29683c4dba..8f878c90d8 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -1,5 +1,12 @@
 #include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include "local-execution/device_state_initialization.h"
+#include "local-execution/tensor_allocation.h"
 #include "pcg/optimizer_attrs.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/loss_insertion.h"
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h"
+#include "task-spec/dynamic_graph/pass_expansion.h"
+#include "task-spec/dynamic_graph/update_insertion.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -44,6 +51,15 @@ std::optional<GenericTensorAccessorR>
   return this->logit_grad_tensor;
 }
 
+static GenericTensorAccessorW
+    get_loss_tensor_accessor(DynamicOpenDataflowGraph const &dg,
+                             DynamicValueAttrs const &value) {
+  return find_output_tensor(dg, value.tensor_guid, value.role)
+      .value()
+      .second.accessor.value()
+      .get<GenericTensorAccessorW>();
+}
+
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     ParallelComputationGraph const &pcg,
     OptimizerAttrs const &optimizer_attrs,
@@ -57,6 +73,36 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     device_handle_t const &device_handle,
     FFIterationConfig const &iteration_config,
     device_id_t device_idx) {
+
+  DynamicOpenDataflowGraph dg = make_dynamic_open_dataflow_graph_from_pcg(pcg);
+  dg = perform_pass_expansion(dg);
+
+  std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> inputs =
+      input_tensors;
+  std::optional<DynamicValueAttrs> logit_grad_value;
+  if (loss_attrs) {
+    auto [dg2, label_v, logit_grad_v] = perform_loss_insertion(
+        dg, assert_unwrap(loss_attrs), assert_unwrap(logit_tensor));
+    dg = dg2;
+    logit_grad_value = logit_grad_v;
+    inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
+  }
+
+  dg = perform_update_insertion(dg, optimizer_attrs);
+  dg = perform_tensor_allocation(dg, inputs, allocator);
+
+  std::optional<GenericTensorAccessorW> logit_grad_tensor =
+      transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
+        return get_loss_tensor_accessor(dg, lgv);
+      });
+
+  dg = perform_device_state_initialization(dg,
+                                           allocator,
+                                           profiling_settings,
+                                           device_handle,
+                                           iteration_config,
+                                           optimizer_attrs,
+                                           device_idx);
   NOT_IMPLEMENTED();
 }
 

From 44cfb8ccfa00e890682eee31a03625f96f7ca0f1 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 16:39:30 -0800
Subject: [PATCH 006/113] Add Realm manager and test it.

---
 lib/realm-execution/CMakeLists.txt            | 11 ++++----
 .../include/realm-execution/realm_manager.h   | 27 +++++++++++++++++++
 .../src/realm-execution/realm_manager.cc      | 22 +++++++++++++++
 .../test/src/realm-execution/test_e2e.cc      | 10 ++++++-
 4 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/realm_manager.h
 create mode 100644 lib/realm-execution/src/realm-execution/realm_manager.cc

diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt
index 7a38f70607..0a1b681b8d 100644
--- a/lib/realm-execution/CMakeLists.txt
+++ b/lib/realm-execution/CMakeLists.txt
@@ -8,14 +8,15 @@ ff_add_library(
   PRIVATE_INCLUDE
     src/
   DEPS
-    op-attrs
-    utils
+    compiler
     kernels
-    task-spec
+    local-execution
+    op-attrs
     pcg
+    realm
     spdlog
-    compiler
-    local-execution
+    task-spec
+    utils
 )
 
 add_subdirectory(test)
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
new file mode 100644
index 0000000000..a08668e6cc
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H
+
+#include "realm.h"
+
+namespace FlexFlow {
+
+struct RealmManager {
+public:
+  RealmManager(int *argc, char ***argv);
+
+  RealmManager() = delete;
+  RealmManager(RealmManager const &) = delete;
+  RealmManager(RealmManager &&) = delete;
+
+  Realm::Runtime get_runtime();
+  void shutdown();
+  int wait_for_shutdown();
+
+private:
+  Realm::Runtime runtime;
+  Realm::Event last_event = Realm::Event::NO_EVENT;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
new file mode 100644
index 0000000000..5a085bc04b
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -0,0 +1,22 @@
+#include "realm-execution/realm_manager.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+RealmManager::RealmManager(int *argc, char ***argv) {
+  bool ok = this->runtime.init(argc, argv);
+  ASSERT(ok);
+}
+
+Realm::Runtime RealmManager::get_runtime() {
+  return this->runtime;
+}
+
+void RealmManager::shutdown() {
+  this->runtime.shutdown(this->last_event);
+}
+
+int RealmManager::wait_for_shutdown() {
+  return this->runtime.wait_for_shutdown();
+}
+} // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 78a57fb99f..947a02e6be 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -1,8 +1,16 @@
 #include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include "realm-execution/realm_manager.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("RealmBackend e2e Training") {}
+  TEST_CASE("RealmBackend e2e Training") {
+    char fake_executable_name[] = "fake_executable_name";
+    std::vector<char *> fake_args{fake_executable_name};
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+    RealmManager manager(&fake_argc, &fake_argv);
+    manager.shutdown();
+  }
 }

From c8c3119f8058e271bd6350c223d6d04ef58d3700 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 16:45:57 -0800
Subject: [PATCH 007/113] Do not expose raw runtime and properly wait in test.

---
 lib/realm-execution/include/realm-execution/realm_manager.h | 1 -
 lib/realm-execution/src/realm-execution/realm_manager.cc    | 5 +----
 lib/realm-execution/test/src/realm-execution/test_e2e.cc    | 2 ++
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index a08668e6cc..f9fa9f7de7 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -13,7 +13,6 @@ struct RealmManager {
   RealmManager(RealmManager const &) = delete;
   RealmManager(RealmManager &&) = delete;
 
-  Realm::Runtime get_runtime();
   void shutdown();
   int wait_for_shutdown();
 
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 5a085bc04b..014a16718a 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -8,10 +8,6 @@ RealmManager::RealmManager(int *argc, char ***argv) {
   ASSERT(ok);
 }
 
-Realm::Runtime RealmManager::get_runtime() {
-  return this->runtime;
-}
-
 void RealmManager::shutdown() {
   this->runtime.shutdown(this->last_event);
 }
@@ -19,4 +15,5 @@ void RealmManager::shutdown() {
 int RealmManager::wait_for_shutdown() {
   return this->runtime.wait_for_shutdown();
 }
+
 } // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 947a02e6be..b88807e079 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -12,5 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     char **fake_argv = fake_args.data();
     RealmManager manager(&fake_argc, &fake_argv);
     manager.shutdown();
+    int result = manager.wait_for_shutdown();
+    ASSERT(result == 0);
   }
 }

From 4aa2a1165f66328db2d8baa03bca7a28e547ace1 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 4 Feb 2026 17:11:02 -0800
Subject: [PATCH 008/113] Sketch more Realm manager APIs.

---
 .../parallel_computation_graph_instance.h     | 13 ++++++------
 .../include/realm-execution/realm_manager.h   |  8 +++++++
 .../parallel_computation_graph_instance.cc    | 21 +++++++++----------
 .../src/realm-execution/realm_manager.cc      | 11 ++++++++++
 4 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index b0529761c1..4ba77a7925 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -9,6 +9,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "realm-execution/realm_manager.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
@@ -20,8 +21,8 @@ namespace FlexFlow {
 
 struct ParallelComputationGraphInstance {
 public:
-  ParallelComputationGraphInstance(DynamicOpenDataflowGraph,
-                                   Allocator &,
+  ParallelComputationGraphInstance(RealmManager &,
+                                   DynamicOpenDataflowGraph,
                                    std::vector<DynamicNodeInvocation> const &,
                                    OptimizerAttrs const &,
                                    std::optional<LossAttrs> const &,
@@ -35,8 +36,8 @@ struct ParallelComputationGraphInstance {
   std::optional<GenericTensorAccessorR> get_loss_tensor_accessor() const;
 
 private:
+  RealmManager &realm;
   DynamicOpenDataflowGraph dataflow_graph;
-  Allocator &allocator;
   std::vector<DynamicNodeInvocation> topological_ordering;
   OptimizerAttrs optimizer_attrs;
   std::optional<LossAttrs> loss_attrs;
@@ -44,6 +45,7 @@ struct ParallelComputationGraphInstance {
 };
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
+    RealmManager &realm,
     ParallelComputationGraph const &pcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
@@ -51,11 +53,8 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     std::optional<dynamic_tensor_guid_t> logit_tensor,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
-    Allocator &allocator,
     ProfilingSettings const &profiling_settings,
-    device_handle_t const &device_handle,
-    FFIterationConfig const &iteration_config,
-    device_id_t device_idx);
+    FFIterationConfig const &iteration_config);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index f9fa9f7de7..9261bc91f4 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -1,6 +1,9 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H
 
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "pcg/device_id_t.dtg.h"
 #include "realm.h"
 
 namespace FlexFlow {
@@ -16,6 +19,11 @@ struct RealmManager {
   void shutdown();
   int wait_for_shutdown();
 
+  Allocator &get_current_device_allocator() const;
+
+  device_handle_t const &get_current_device_handle() const;
+  device_id_t const &get_current_device_idx() const;
+
 private:
   Realm::Runtime runtime;
   Realm::Event last_event = Realm::Event::NO_EVENT;
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 8f878c90d8..64c9da2f4c 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -12,13 +12,13 @@
 namespace FlexFlow {
 
 ParallelComputationGraphInstance::ParallelComputationGraphInstance(
+    RealmManager &realm,
     DynamicOpenDataflowGraph dataflow_graph,
-    Allocator &allocator,
     std::vector<DynamicNodeInvocation> const &topological_ordering,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorW> logit_grad_tensor)
-    : dataflow_graph(dataflow_graph), allocator(allocator),
+    : realm(realm), dataflow_graph(dataflow_graph),
       topological_ordering(topological_ordering),
       optimizer_attrs(optimizer_attrs), loss_attrs(loss_attrs),
       logit_grad_tensor(logit_grad_tensor) {}
@@ -28,7 +28,7 @@ DynamicOpenDataflowGraph const &
   return this->dataflow_graph;
 }
 Allocator &ParallelComputationGraphInstance::get_allocator() const {
-  return this->allocator;
+  return this->realm.get_current_device_allocator();
 }
 std::vector<DynamicNodeInvocation> const &
     ParallelComputationGraphInstance::get_topological_ordering() const {
@@ -61,6 +61,7 @@ static GenericTensorAccessorW
 }
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
+    RealmManager &realm,
     ParallelComputationGraph const &pcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
@@ -68,11 +69,8 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     std::optional<dynamic_tensor_guid_t> logit_tensor,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
-    Allocator &allocator,
     ProfilingSettings const &profiling_settings,
-    device_handle_t const &device_handle,
-    FFIterationConfig const &iteration_config,
-    device_id_t device_idx) {
+    FFIterationConfig const &iteration_config) {
 
   DynamicOpenDataflowGraph dg = make_dynamic_open_dataflow_graph_from_pcg(pcg);
   dg = perform_pass_expansion(dg);
@@ -89,7 +87,8 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   }
 
   dg = perform_update_insertion(dg, optimizer_attrs);
-  dg = perform_tensor_allocation(dg, inputs, allocator);
+  dg = perform_tensor_allocation(
+      dg, inputs, realm.get_current_device_allocator());
 
   std::optional<GenericTensorAccessorW> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
@@ -97,12 +96,12 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
       });
 
   dg = perform_device_state_initialization(dg,
-                                           allocator,
+                                           realm.get_current_device_allocator(),
                                            profiling_settings,
-                                           device_handle,
+                                           realm.get_current_device_handle(),
                                            iteration_config,
                                            optimizer_attrs,
-                                           device_idx);
+                                           realm.get_current_device_idx());
   NOT_IMPLEMENTED();
 }
 
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 014a16718a..b136b4c379 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -16,4 +16,15 @@ int RealmManager::wait_for_shutdown() {
   return this->runtime.wait_for_shutdown();
 }
 
+Allocator &RealmManager::get_current_device_allocator() const {
+  NOT_IMPLEMENTED();
+}
+
+device_handle_t const &RealmManager::get_current_device_handle() const {
+  NOT_IMPLEMENTED();
+}
+device_id_t const &RealmManager::get_current_device_idx() const {
+  NOT_IMPLEMENTED();
+}
+
 } // namespace FlexFlow

From 820fc1c064dd43022c14c2acfdefb9900ac5c7f9 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 09:57:00 -0800
Subject: [PATCH 009/113] Add controller functionality.

---
 .../include/realm-execution/realm_manager.h   | 17 ++++--
 .../src/realm-execution/realm_manager.cc      | 60 +++++++++++++++++--
 .../test/src/realm-execution/test_e2e.cc      |  4 +-
 3 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 9261bc91f4..497a1f3958 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -11,22 +11,31 @@ namespace FlexFlow {
 struct RealmManager {
 public:
   RealmManager(int *argc, char ***argv);
+  ~RealmManager();
 
   RealmManager() = delete;
   RealmManager(RealmManager const &) = delete;
   RealmManager(RealmManager &&) = delete;
 
-  void shutdown();
-  int wait_for_shutdown();
+  Realm::Event start_controller(void (*thunk)(RealmManager &));
 
+  // Current device context
   Allocator &get_current_device_allocator() const;
-
   device_handle_t const &get_current_device_handle() const;
   device_id_t const &get_current_device_idx() const;
 
+private:
+  RealmManager(void const *, size_t, void const *, size_t, Realm::Processor);
+
+  [[nodiscard]] Realm::Event merge_outstanding_events();
+
+  static void controller_task_wrapper(
+      void const *, size_t, void const *, size_t, Realm::Processor);
+
 private:
   Realm::Runtime runtime;
-  Realm::Event last_event = Realm::Event::NO_EVENT;
+  std::vector<Realm::Event> outstanding_events;
+  bool is_root_runtime;
 };
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index b136b4c379..acc11936c7 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -3,17 +3,48 @@
 
 namespace FlexFlow {
 
-RealmManager::RealmManager(int *argc, char ***argv) {
+RealmManager::RealmManager(int *argc, char ***argv) : is_root_runtime(true) {
   bool ok = this->runtime.init(argc, argv);
   ASSERT(ok);
 }
 
-void RealmManager::shutdown() {
-  this->runtime.shutdown(this->last_event);
+RealmManager::RealmManager(void const *args,
+                           size_t arglen,
+                           void const *userdata,
+                           size_t userdatalen,
+                           Realm::Processor proc)
+    : runtime(Realm::Runtime::get_runtime()), is_root_runtime(false) {}
+
+RealmManager::~RealmManager() {
+  Realm::Event outstanding = this->merge_outstanding_events();
+  if (is_root_runtime) {
+    this->runtime.shutdown(outstanding);
+    this->runtime.wait_for_shutdown();
+  } else {
+    outstanding.wait();
+  }
 }
 
-int RealmManager::wait_for_shutdown() {
-  return this->runtime.wait_for_shutdown();
+Realm::Event RealmManager::start_controller(void (*thunk)(RealmManager &)) {
+  constexpr int CONTROLLER_TASK_ID = Realm::Processor::TASK_ID_FIRST_AVAILABLE;
+  Realm::Event task_ready = Realm::Processor::register_task_by_kind(
+      Realm::Processor::LOC_PROC,
+      /*global=*/false,
+      CONTROLLER_TASK_ID,
+      Realm::CodeDescriptor(RealmManager::controller_task_wrapper),
+      Realm::ProfilingRequestSet(),
+      &thunk,
+      sizeof(thunk));
+
+  Realm::Processor target_proc =
+      Realm::Machine::ProcessorQuery(Realm::Machine::get_machine())
+          .only_kind(Realm::Processor::LOC_PROC)
+          .first();
+
+  Realm::Event task_complete = this->runtime.collective_spawn(
+      target_proc, CONTROLLER_TASK_ID, &thunk, sizeof(thunk), task_ready);
+  this->outstanding_events.push_back(task_complete);
+  return task_complete;
 }
 
 Allocator &RealmManager::get_current_device_allocator() const {
@@ -27,4 +58,23 @@ device_id_t const &RealmManager::get_current_device_idx() const {
   NOT_IMPLEMENTED();
 }
 
+Realm::Event RealmManager::merge_outstanding_events() {
+  Realm::Event result = Realm::Event::merge_events(this->outstanding_events);
+  this->outstanding_events.clear();
+  return result;
+}
+
+void RealmManager::controller_task_wrapper(void const *args,
+                                           size_t arglen,
+                                           void const *userdata,
+                                           size_t userlen,
+                                           Realm::Processor proc) {
+  assert(arglen == sizeof(void (*)(RealmManager &)));
+  void (*thunk)(RealmManager &) =
+      *reinterpret_cast<void (**)(RealmManager &)>(const_cast<void *>(args));
+
+  RealmManager manager(args, arglen, userdata, userlen, proc);
+  thunk(manager);
+}
+
 } // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index b88807e079..f09951e73c 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -11,8 +11,6 @@ TEST_SUITE(FF_TEST_SUITE) {
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
     RealmManager manager(&fake_argc, &fake_argv);
-    manager.shutdown();
-    int result = manager.wait_for_shutdown();
-    ASSERT(result == 0);
+    manager.start_controller([](RealmManager &manager) {});
   }
 }

From 66603715fd132e4b1d0ef326d8684ed7bcadfef0 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 12:29:19 -0800
Subject: [PATCH 010/113] Fix Realm tests.

---
 .flake/pkgs/legion.nix                        | 48 -------------------
 .flake/pkgs/realm.nix                         | 44 +++++++++++++++++
 flake.nix                                     | 21 ++++----
 lib/realm-execution/CMakeLists.txt            |  2 +-
 .../src/realm-execution/realm_manager.cc      |  2 +-
 .../test/src/realm-execution/realm_manager.cc | 22 +++++++++
 6 files changed, 78 insertions(+), 61 deletions(-)
 delete mode 100644 .flake/pkgs/legion.nix
 create mode 100644 .flake/pkgs/realm.nix
 create mode 100644 lib/realm-execution/test/src/realm-execution/realm_manager.cc

diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix
deleted file mode 100644
index 361a66c4ff..0000000000
--- a/.flake/pkgs/legion.nix
+++ /dev/null
@@ -1,48 +0,0 @@
-{ lib
-, stdenv
-, fetchFromGitLab
-, cmake
-, cudaPackages ? { }
-, cudaCapabilities ? [ "60" "70" "80" "86" ]
-, maxDim ? 5
-}:
-
-# from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix
-
-let 
-  cmakeFlag = x: if x then "1" else "0";
-
-  inherit (cudaPackages) cudatoolkit;
-in
-
-stdenv.mkDerivation rec {
-  pname = "legion";
-  version = "2025-01-06";
-
-  src = fetchFromGitLab {
-    owner = "StanfordLegion";
-    repo = "legion";
-    rev = "7be1abd0207eb1126c7629b16d1123fa6f58ce9d";
-    sha256 = "sha256-gTjnGYYTQwTsrV1WcY0qqpTrlwbzAPcndurRy6XnG8A=";
-  };
-
-  nativeBuildInputs = [
-    cmake
-  ];
-
-  cmakeFlags = [
-    "-DLegion_USE_CUDA=1"
-    "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
-    "-DLegion_MAX_DIM=${toString maxDim}"
-  ];
-
-  buildInputs = [ 
-    cudatoolkit
-  ];
-
-  meta = with lib; {
-    description = "Legion is a parallel programming model for distributed, heterogeneous machines";
-    homepage = "https://legion.stanford.edu/";
-    license = licenses.asl20;
-  };
-}
diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix
new file mode 100644
index 0000000000..1249c0ae28
--- /dev/null
+++ b/.flake/pkgs/realm.nix
@@ -0,0 +1,44 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, cmake
+, cudaPackages ? { }
+, maxDim ? 5
+}:
+
+let
+  inherit (cudaPackages) cudatoolkit;
+in
+
+stdenv.mkDerivation rec {
+  pname = "realm";
+  version = "2025-01-06";
+
+  # This version is compatible with Legion 7be1abd0207eb1126c7629b16d1123fa6f58ce9d
+  src = fetchFromGitHub {
+    owner = "StanfordLegion";
+    repo = "realm";
+    rev = "0ef7edc8c012d4ab6a50805c044cec8a8edeae33";
+    sha256 = "sha256-57/a1lAgs+ajpRn0y0Lk1gP5nKt+N08WW0DIJP4vdho=";
+  };
+
+  nativeBuildInputs = [
+    cmake
+  ];
+
+  cmakeFlags = [
+    "-DBUILD_SHARED_LIBS=ON"
+    "-DREALM_ENABLE_CUDA=ON"
+    "-DREALM_MAX_DIM=${toString maxDim}"
+  ];
+
+  buildInputs = [
+    cudatoolkit
+  ];
+
+  meta = with lib; {
+    description = "Realm is a distributed, event–based tasking runtime for building high-performance applications that span clusters of CPUs, GPUs, and other accelerators";
+    homepage = "https://legion.stanford.edu/realm";
+    license = licenses.asl20;
+  };
+}
diff --git a/flake.nix b/flake.nix
index 6ccd5616cd..dad0e2fc32 100644
--- a/flake.nix
+++ b/flake.nix
@@ -30,8 +30,8 @@
     };
   };
 
-  outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: 
-    let 
+  outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system:
+    let
       pkgs = import nixpkgs {
         inherit system;
         config.allowUnfree = true;
@@ -41,21 +41,21 @@
       mkShell = attrs: pkgs.mkShell.override {
         stdenv = pkgs.cudaPackages.backendStdenv;
       } (attrs // {
-        hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch 
-                                    # signed overflows due to the signedoverflow hardening setting. 
-                                    # for more details, see the following (long-running) nixpkgs github issues: 
+        hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch
+                                    # signed overflows due to the signedoverflow hardening setting.
+                                    # for more details, see the following (long-running) nixpkgs github issues:
                                     # - https://github.com/NixOS/nixpkgs/issues/18995
                                     # - https://github.com/NixOS/nixpkgs/issues/60919
       });
 
       proj = proj-repo.packages.${system}.proj;
-    in 
+    in
     {
       packages = rec {
         libdwarf-lite = pkgs.callPackage ./.flake/pkgs/libdwarf-lite.nix { };
         cpptrace = pkgs.callPackage ./.flake/pkgs/cpptrace.nix { inherit libdwarf-lite; };
         libassert = pkgs.callPackage ./.flake/pkgs/libassert.nix { inherit cpptrace; };
-        legion = pkgs.callPackage ./.flake/pkgs/legion.nix { };
+        realm = pkgs.callPackage ./.flake/pkgs/realm.nix { };
         bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { };
         ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; };
         hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { };
@@ -83,8 +83,7 @@
           shellHook = ''
             export PATH="$HOME/ff/.scripts/:$PATH"
             export RC_PARAMS="max_discard_ratio=100"
-            export CMAKE_FLAGS="-DFF_USE_EXTERNAL_LEGION=ON \
-                                -DFF_USE_EXTERNAL_NCCL=ON \
+            export CMAKE_FLAGS="-DFF_USE_EXTERNAL_NCCL=ON \
                                 -DFF_USE_EXTERNAL_JSON=ON \
                                 -DFF_USE_EXTERNAL_FMT=ON \
                                 -DFF_USE_EXTERNAL_SPDLOG=ON \
@@ -94,7 +93,7 @@
                                 -DFF_USE_EXTERNAL_GBENCHMARK=ON \
                                 -DFF_USE_EXTERNAL_LIBASSERT=ON"
           '';
-          
+
           buildInputs = builtins.concatLists [
             (with pkgs; [
               zlib
@@ -125,7 +124,7 @@
             ])
             (with self.packages.${system}; [
               libassert
-              legion
+              realm
               rapidcheckFull
               doctest
             ])
diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt
index 0a1b681b8d..08676525e1 100644
--- a/lib/realm-execution/CMakeLists.txt
+++ b/lib/realm-execution/CMakeLists.txt
@@ -13,10 +13,10 @@ ff_add_library(
     local-execution
     op-attrs
     pcg
-    realm
     spdlog
     task-spec
     utils
+    Realm::Realm
 )
 
 add_subdirectory(test)
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index acc11936c7..33e7ca252e 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -71,7 +71,7 @@ void RealmManager::controller_task_wrapper(void const *args,
                                            Realm::Processor proc) {
   assert(arglen == sizeof(void (*)(RealmManager &)));
   void (*thunk)(RealmManager &) =
-      *reinterpret_cast<void (**)(RealmManager &)>(const_cast<void *>(args));
+      *reinterpret_cast<void (*const *)(RealmManager &)>(args);
 
   RealmManager manager(args, arglen, userdata, userlen, proc);
   thunk(manager);
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
new file mode 100644
index 0000000000..880268c018
--- /dev/null
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -0,0 +1,22 @@
+#include "realm-execution/realm_manager.h"
+#include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("RealmManager") {
+    // Construct some fake command line for our test
+    char fake_executable_name[] = "fake_executable_name";
+    std::vector<char *> fake_args{fake_executable_name};
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+
+    // Initialize Realm
+    RealmManager manager(&fake_argc, &fake_argv);
+
+    // Launch a controller and wait on it
+    Realm::Event event = manager.start_controller([](RealmManager &manager) {});
+    event.wait();
+  }
+}

From 2782de01c8317fa8ce0d56a8bf320ac7c6fdd54c Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 12:33:22 -0800
Subject: [PATCH 011/113] Support passing closure arguments to controllers.

---
 .../include/realm-execution/realm_manager.h              | 3 ++-
 lib/realm-execution/src/realm-execution/realm_manager.cc | 9 +++++----
 .../test/src/realm-execution/realm_manager.cc            | 7 +++++--
 lib/realm-execution/test/src/realm-execution/test_e2e.cc | 3 ++-
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 497a1f3958..88cc11f744 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -17,7 +17,8 @@ struct RealmManager {
   RealmManager(RealmManager const &) = delete;
   RealmManager(RealmManager &&) = delete;
 
-  Realm::Event start_controller(void (*thunk)(RealmManager &));
+  [[nodiscard]] Realm::Event
+      start_controller(std::function<void(RealmManager &)>);
 
   // Current device context
   Allocator &get_current_device_allocator() const;
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 33e7ca252e..0ccf3f4116 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -25,7 +25,8 @@ RealmManager::~RealmManager() {
   }
 }
 
-Realm::Event RealmManager::start_controller(void (*thunk)(RealmManager &)) {
+Realm::Event
+    RealmManager::start_controller(std::function<void(RealmManager &)> thunk) {
   constexpr int CONTROLLER_TASK_ID = Realm::Processor::TASK_ID_FIRST_AVAILABLE;
   Realm::Event task_ready = Realm::Processor::register_task_by_kind(
       Realm::Processor::LOC_PROC,
@@ -69,9 +70,9 @@ void RealmManager::controller_task_wrapper(void const *args,
                                            void const *userdata,
                                            size_t userlen,
                                            Realm::Processor proc) {
-  assert(arglen == sizeof(void (*)(RealmManager &)));
-  void (*thunk)(RealmManager &) =
-      *reinterpret_cast<void (*const *)(RealmManager &)>(args);
+  ASSERT(arglen == sizeof(std::function<void(RealmManager &)>));
+  std::function<void(RealmManager &)> thunk =
+      *reinterpret_cast<std::function<void(RealmManager &)> const *>(args);
 
   RealmManager manager(args, arglen, userdata, userlen, proc);
   thunk(manager);
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 880268c018..16b5338881 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -15,8 +15,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     // Initialize Realm
     RealmManager manager(&fake_argc, &fake_argv);
 
-    // Launch a controller and wait on it
-    Realm::Event event = manager.start_controller([](RealmManager &manager) {});
+    // Launch a controller
+    int some_data = 123;
+    Realm::Event event = manager.start_controller(
+        [&](RealmManager &manager) { ASSERT(some_data == 123); });
+    // Need to block on the completion of the event to ensure we don't race
     event.wait();
   }
 }
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index f09951e73c..623b8318e6 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -11,6 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
     RealmManager manager(&fake_argc, &fake_argv);
-    manager.start_controller([](RealmManager &manager) {});
+    Realm::Event event = manager.start_controller([](RealmManager &manager) {});
+    event.wait();
   }
 }

From 2fc4fa99db51709fabbc8b503a7d075f23256be8 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 14:22:44 -0800
Subject: [PATCH 012/113] Move task IDs into Realm and assign IDs to remaining
 tasks.

---
 .../realm-execution}/task_id_t.dtg.toml       |   5 +-
 .../include/realm-execution/task_id_t.h       |  28 ++
 .../src/realm-execution/task_id_t.cc          | 192 ++++++++++++++
 .../include/task-spec/ops/impl/dropout.h      |   1 -
 .../task-spec/ops/op_task_id_t.dtg.toml       |  18 --
 .../task_id_with_noop_default_t.dtg.toml      |  28 --
 .../task-spec/task_id_with_noop_default_t.h   |  28 --
 .../task-spec/task_id_with_noop_default_t.cc  | 243 ------------------
 8 files changed, 221 insertions(+), 322 deletions(-)
 rename lib/{task-spec/include/task-spec => realm-execution/include/realm-execution}/task_id_t.dtg.toml (98%)
 create mode 100644 lib/realm-execution/include/realm-execution/task_id_t.h
 create mode 100644 lib/realm-execution/src/realm-execution/task_id_t.cc
 delete mode 100644 lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml
 delete mode 100644 lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml
 delete mode 100644 lib/task-spec/include/task-spec/task_id_with_noop_default_t.h
 delete mode 100644 lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc

diff --git a/lib/task-spec/include/task-spec/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/task_id_t.dtg.toml
similarity index 98%
rename from lib/task-spec/include/task-spec/task_id_t.dtg.toml
rename to lib/realm-execution/include/realm-execution/task_id_t.dtg.toml
index ce2de52d40..0336bc81a4 100644
--- a/lib/task-spec/include/task-spec/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/task_id_t.dtg.toml
@@ -9,10 +9,7 @@ features = [
 ]
 
 [[values]]
-name = "TOP_LEVEL_TASK_ID"
-
-[[values]]
-name = "FF_INIT_TASK_ID"
+name = "CONTROLLER_TASK_ID"
 
 [[values]]
 name = "IMAGE_INIT_TASK_ID"
diff --git a/lib/realm-execution/include/realm-execution/task_id_t.h b/lib/realm-execution/include/realm-execution/task_id_t.h
new file mode 100644
index 0000000000..af20dc27f6
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/task_id_t.h
@@ -0,0 +1,28 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASK_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASK_ID_T_H
+
+#include "op-attrs/pcg_operator_attrs.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/task_id_t.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+
+std::optional<task_id_t>
+    get_task_id_for_op(DynamicNodeInvocation const &,
+                       std::optional<OptimizerAttrs> const &);
+
+std::optional<task_id_t>
+    get_init_task_id_for_op_attrs(PCGOperatorAttrs const &);
+
+std::optional<task_id_t> get_fwd_task_id_for_op_attrs(PCGOperatorAttrs const &);
+
+std::optional<task_id_t> get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &);
+
+std::optional<task_id_t>
+    get_update_task_id_for_optimizer_attrs(OptimizerAttrs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/task_id_t.cc b/lib/realm-execution/src/realm-execution/task_id_t.cc
new file mode 100644
index 0000000000..94b5fb5b24
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/task_id_t.cc
@@ -0,0 +1,192 @@
+#include "realm-execution/task_id_t.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_task_type.dtg.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+std::optional<task_id_t>
+    get_task_id_for_op(DynamicNodeInvocation const &invocation,
+                       std::optional<OptimizerAttrs> const &optimizer_attrs) {
+  DynamicTaskType task_type = invocation.node_attrs.task_type.value();
+  switch (task_type) {
+    case DynamicTaskType::FWD:
+      return get_fwd_task_id_for_op_attrs(
+          invocation.node_attrs.op_attrs.value());
+    case DynamicTaskType::BWD:
+      return get_bwd_task_id_for_op_attrs(
+          invocation.node_attrs.op_attrs.value());
+    case DynamicTaskType::UPD:
+      return get_update_task_id_for_optimizer_attrs(optimizer_attrs.value());
+    case DynamicTaskType::LOSS:
+      return task_id_t::LOSS_BWD_TASK_ID;
+    default:
+      PANIC("Unhandled DynamicTaskType", task_type);
+  }
+}
+
+std::optional<task_id_t>
+    get_init_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) {
+
+  return op_attrs.visit<std::optional<task_id_t>>(overload{
+      [](BatchMatmulAttrs const &) { return std::nullopt; },
+      [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_INIT_TASK_ID; },
+      [](BroadcastAttrs const &) { return std::nullopt; },
+      [](CastAttrs const &) { return std::nullopt; },
+      [](CombineAttrs const &attrs) { return task_id_t::COMBINE_INIT_TASK_ID; },
+      [](ConcatAttrs const &) { return std::nullopt; },
+      [](Conv2DAttrs const &) { return task_id_t::CONV2D_INIT_TASK_ID; },
+      [](DropoutAttrs const &) { return task_id_t::DROPOUT_INIT_TASK_ID; },
+      [](ElementBinaryAttrs const &) {
+        return task_id_t::ELEMENTBINARY_INIT_TASK_ID;
+      },
+      [](ElementUnaryAttrs const &) {
+        return task_id_t::ELEMENTUNARY_INIT_TASK_ID;
+      },
+      [](EmbeddingAttrs const &) { return std::nullopt; },
+      [](FlatAttrs const &) { return std::nullopt; },
+      [](GatherAttrs const &) { return task_id_t::GATHER_INIT_TASK_ID; },
+      [](InputAttrs const &) { return std::nullopt; },
+      [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_INIT_TASK_ID; },
+      [](LinearAttrs const &) { return task_id_t::LINEAR_INIT_TASK_ID; },
+      [](MultiHeadAttentionAttrs const &) {
+        return task_id_t::ATTENTION_INIT_TASK_ID;
+      },
+      [](NoopAttrs const &) { return std::nullopt; },
+      [](Pool2DAttrs const &) { return task_id_t::POOL2D_INIT_TASK_ID; },
+      [](ReduceAttrs const &) { return task_id_t::REDUCE_INIT_TASK_ID; },
+      [](ReductionAttrs const &attrs) {
+        return task_id_t::REDUCTION_INIT_TASK_ID;
+      },
+      [](RepartitionAttrs const &attrs) {
+        return task_id_t::REPARTITION_INIT_TASK_ID;
+      },
+      [](ReplicateAttrs const &attrs) {
+        return task_id_t::REPLICATE_INIT_TASK_ID;
+      },
+      [](ReshapeAttrs const &) { return std::nullopt; },
+      [](ReverseAttrs const &) { return std::nullopt; },
+      [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_INIT_TASK_ID; },
+      [](SplitAttrs const &) { return std::nullopt; },
+      [](TopKAttrs const &) { return std::nullopt; },
+      [](TransposeAttrs const &) { return std::nullopt; },
+      [](WeightAttrs const &) { return std::nullopt; },
+  });
+}
+
+std::optional<task_id_t>
+    get_fwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) {
+
+  return op_attrs.visit<std::optional<task_id_t>>(overload{
+      [](BatchMatmulAttrs const &) {
+        return task_id_t::BATCHMATMUL_FWD_TASK_ID;
+      },
+      [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_FWD_TASK_ID; },
+      [](BroadcastAttrs const &) { return task_id_t::BROADCAST_FWD_TASK_ID; },
+      [](CastAttrs const &) { return task_id_t::CAST_FWD_TASK_ID; },
+      [](CombineAttrs const &attrs) { return task_id_t::COMBINE_FWD_TASK_ID; },
+      [](ConcatAttrs const &) { return task_id_t::CONCAT_FWD_TASK_ID; },
+      [](Conv2DAttrs const &) { return task_id_t::CONV2D_FWD_TASK_ID; },
+      [](DropoutAttrs const &) { return task_id_t::DROPOUT_FWD_TASK_ID; },
+      [](ElementBinaryAttrs const &) {
+        return task_id_t::ELEMENTBINARY_FWD_TASK_ID;
+      },
+      [](ElementUnaryAttrs const &) {
+        return task_id_t::ELEMENTUNARY_FWD_TASK_ID;
+      },
+      [](EmbeddingAttrs const &) { return task_id_t::EMBED_FWD_TASK_ID; },
+      [](FlatAttrs const &) { return task_id_t::FLAT_FWD_TASK_ID; },
+      [](GatherAttrs const &) { return task_id_t::GATHER_FWD_TASK_ID; },
+      [](InputAttrs const &) { return std::nullopt; },
+      [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_FWD_TASK_ID; },
+      [](LinearAttrs const &) { return task_id_t::LINEAR_FWD_TASK_ID; },
+      [](MultiHeadAttentionAttrs const &) {
+        return task_id_t::ATTENTION_FWD_TASK_ID;
+      },
+      [](NoopAttrs const &) { return std::nullopt; },
+      [](Pool2DAttrs const &) { return task_id_t::POOL2D_FWD_TASK_ID; },
+      [](ReduceAttrs const &) { return task_id_t::REDUCE_FWD_TASK_ID; },
+      [](ReductionAttrs const &attrs) {
+        return task_id_t::REDUCTION_FWD_TASK_ID;
+      },
+      [](RepartitionAttrs const &attrs) {
+        return task_id_t::REPARTITION_FWD_TASK_ID;
+      },
+      [](ReplicateAttrs const &attrs) {
+        return task_id_t::REPLICATE_FWD_TASK_ID;
+      },
+      [](ReshapeAttrs const &) { return task_id_t::RESHAPE_FWD_TASK_ID; },
+      [](ReverseAttrs const &) { return task_id_t::REVERSE_FWD_TASK_ID; },
+      [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_FWD_TASK_ID; },
+      [](SplitAttrs const &) { return task_id_t::SPLIT_FWD_TASK_ID; },
+      [](TopKAttrs const &) { return task_id_t::TOPK_FWD_TASK_ID; },
+      [](TransposeAttrs const &) { return task_id_t::TRANSPOSE_FWD_TASK_ID; },
+      [](WeightAttrs const &) { return std::nullopt; },
+  });
+}
+
+std::optional<task_id_t>
+    get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) {
+
+  return op_attrs.visit<std::optional<task_id_t>>(overload{
+      [](BatchMatmulAttrs const &) {
+        return task_id_t::BATCHMATMUL_BWD_TASK_ID;
+      },
+      [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_BWD_TASK_ID; },
+      [](BroadcastAttrs const &) { return task_id_t::BROADCAST_BWD_TASK_ID; },
+      [](CastAttrs const &) { return task_id_t::CAST_BWD_TASK_ID; },
+      [](CombineAttrs const &attrs) { return task_id_t::COMBINE_BWD_TASK_ID; },
+      [](ConcatAttrs const &) { return task_id_t::CONCAT_BWD_TASK_ID; },
+      [](Conv2DAttrs const &) { return task_id_t::CONV2D_BWD_TASK_ID; },
+      [](DropoutAttrs const &) { return task_id_t::DROPOUT_BWD_TASK_ID; },
+      [](ElementBinaryAttrs const &) {
+        return task_id_t::ELEMENTBINARY_BWD_TASK_ID;
+      },
+      [](ElementUnaryAttrs const &) {
+        return task_id_t::ELEMENTUNARY_BWD_TASK_ID;
+      },
+      [](EmbeddingAttrs const &) { return task_id_t::EMBED_BWD_TASK_ID; },
+      [](FlatAttrs const &) { return task_id_t::FLAT_BWD_TASK_ID; },
+      [](GatherAttrs const &) { return task_id_t::GATHER_BWD_TASK_ID; },
+      [](InputAttrs const &) { return std::nullopt; },
+      [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_BWD_TASK_ID; },
+      [](LinearAttrs const &) { return task_id_t::LINEAR_BWD_TASK_ID; },
+      [](MultiHeadAttentionAttrs const &) {
+        return task_id_t::ATTENTION_BWD_TASK_ID;
+      },
+      [](NoopAttrs const &) { return std::nullopt; },
+      [](Pool2DAttrs const &) { return task_id_t::POOL2D_BWD_TASK_ID; },
+      [](ReduceAttrs const &) { return task_id_t::REDUCE_BWD_TASK_ID; },
+      [](ReductionAttrs const &attrs) {
+        return task_id_t::REDUCTION_BWD_TASK_ID;
+      },
+      [](RepartitionAttrs const &attrs) {
+        return task_id_t::REPARTITION_BWD_TASK_ID;
+      },
+      [](ReplicateAttrs const &attrs) {
+        return task_id_t::REPLICATE_BWD_TASK_ID;
+      },
+      [](ReshapeAttrs const &) { return task_id_t::RESHAPE_BWD_TASK_ID; },
+      [](ReverseAttrs const &) { return task_id_t::REVERSE_BWD_TASK_ID; },
+      [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_BWD_TASK_ID; },
+      [](SplitAttrs const &) { return task_id_t::SPLIT_BWD_TASK_ID; },
+      [](TopKAttrs const &) { return task_id_t::TOPK_BWD_TASK_ID; },
+      [](TransposeAttrs const &) { return task_id_t::TRANSPOSE_BWD_TASK_ID; },
+      [](WeightAttrs const &) { return std::nullopt; },
+  });
+}
+
+std::optional<task_id_t> get_update_task_id_for_optimizer_attrs(
+    OptimizerAttrs const &optimizer_attrs) {
+
+  return optimizer_attrs.visit<std::optional<task_id_t>>(overload{
+      [](SGDOptimizerAttrs const &) { return task_id_t::SGD_UPD_NCCL_TASK_ID; },
+      [](AdamOptimizerAttrs const &) {
+        return task_id_t::ADAM_UPD_NCCL_TASK_ID;
+      },
+  });
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/include/task-spec/ops/impl/dropout.h b/lib/task-spec/include/task-spec/ops/impl/dropout.h
index a7b382ce62..192f2f8244 100644
--- a/lib/task-spec/include/task-spec/ops/impl/dropout.h
+++ b/lib/task-spec/include/task-spec/ops/impl/dropout.h
@@ -2,7 +2,6 @@
 #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_IMPL_DROPOUT_H
 
 #include "op-attrs/ops/dropout_attrs.dtg.h"
-#include "task-spec/task_id_t.dtg.h"
 #include "task-spec/task_impl_function.dtg.h"
 
 namespace FlexFlow {
diff --git a/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml b/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml
deleted file mode 100644
index 557da6cf4c..0000000000
--- a/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-namespace = "FlexFlow"
-name = "op_task_id_t"
-type = "enum"
-features = [
-  "hash",
-  "json",
-  "rapidcheck",
-  "fmt",
-]
-
-[[values]]
-name = "INIT"
-
-[[values]]
-name = "FWD"
-
-[[values]]
-name = "BWD"
diff --git a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml b/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml
deleted file mode 100644
index 50349d5773..0000000000
--- a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml
+++ /dev/null
@@ -1,28 +0,0 @@
-namespace = "FlexFlow"
-name = "task_id_with_noop_default_t"
-type = "variant"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "fmt",
-  "rapidcheck",
-]
-
-includes = [
-  "task-spec/task_id_t.dtg.h",
-  "<utility>",
-]
-
-src_includes = [
-  "utils/rapidcheck/monostate.h",
-  "utils/fmt/monostate.h",
-]
-
-[[values]]
-type = "::FlexFlow::task_id_t"
-key = "real_task"
-
-[[values]]
-type = "std::monostate"
-key = "noop_task"
diff --git a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h b/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h
deleted file mode 100644
index 054b73844e..0000000000
--- a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ID_WITH_NOOP_DEFAULT_T_H
-#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ID_WITH_NOOP_DEFAULT_T_H
-
-#include "op-attrs/computation_graph_op_attrs.dtg.h"
-#include "op-attrs/operator_type.dtg.h"
-#include "task-spec/ops/op_task_id_t.dtg.h"
-#include "task-spec/task_id_with_noop_default_t.dtg.h"
-
-namespace FlexFlow {
-
-task_id_with_noop_default_t lift_task_id_t(task_id_t);
-task_id_with_noop_default_t default_noop_task();
-
-task_id_with_noop_default_t lower_op_task_id_to_task_id_with_noop_default_t(
-    op_task_id_t, ComputationGraphOpAttrs const &);
-
-task_id_with_noop_default_t
-    get_init_task_id_for_op_attrs(ComputationGraphOpAttrs const &);
-
-task_id_with_noop_default_t
-    get_fwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &);
-
-task_id_with_noop_default_t
-    get_bwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc
deleted file mode 100644
index 20e0d00c57..0000000000
--- a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-#include "task-spec/task_id_with_noop_default_t.h"
-#include "utils/overload.h"
-
-namespace FlexFlow {
-
-task_id_with_noop_default_t lift_task_id_t(task_id_t task_id) {
-  return task_id_with_noop_default_t{task_id};
-}
-
-task_id_with_noop_default_t default_noop_task() {
-  return task_id_with_noop_default_t{std::monostate{}};
-}
-
-task_id_with_noop_default_t lower_op_task_id_to_task_id_with_noop_default_t(
-    op_task_id_t op_task_id, ComputationGraphOpAttrs const &op_attrs) {
-  switch (op_task_id) {
-    case op_task_id_t::INIT:
-      return get_init_task_id_for_op_attrs(op_attrs);
-    case op_task_id_t::FWD:
-      return get_fwd_task_id_for_op_attrs(op_attrs);
-    case op_task_id_t::BWD:
-      return get_bwd_task_id_for_op_attrs(op_attrs);
-    default:
-      PANIC("Unhandled op_task_id_t", op_task_id);
-  }
-}
-
-task_id_with_noop_default_t
-    get_init_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) {
-
-  return op_attrs.visit<task_id_with_noop_default_t>(overload{
-      [](BatchMatmulAttrs const &) { return default_noop_task(); },
-      [](BatchNormAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHNORM_INIT_TASK_ID);
-      },
-      [](BroadcastAttrs const &) { return default_noop_task(); },
-      [](CastAttrs const &) { return default_noop_task(); },
-      [](ConcatAttrs const &) { return default_noop_task(); },
-      [](Conv2DAttrs const &) {
-        return lift_task_id_t(task_id_t::CONV2D_INIT_TASK_ID);
-      },
-      [](DropoutAttrs const &) {
-        return lift_task_id_t(task_id_t::DROPOUT_INIT_TASK_ID);
-      },
-      [](ElementBinaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID);
-      },
-      [](ElementUnaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTUNARY_INIT_TASK_ID);
-      },
-      [](EmbeddingAttrs const &) { return default_noop_task(); },
-      [](FlatAttrs const &) { return default_noop_task(); },
-      [](GatherAttrs const &) {
-        return lift_task_id_t(task_id_t::GATHER_INIT_TASK_ID);
-      },
-      [](InputAttrs const &) { return default_noop_task(); },
-      [](LayerNormAttrs const &) {
-        return lift_task_id_t(task_id_t::LAYERNORM_INIT_TASK_ID);
-      },
-      [](LinearAttrs const &) {
-        return lift_task_id_t(task_id_t::LINEAR_INIT_TASK_ID);
-      },
-      [](MultiHeadAttentionAttrs const &) {
-        return lift_task_id_t(task_id_t::ATTENTION_INIT_TASK_ID);
-      },
-      [](NoopAttrs const &) { return default_noop_task(); },
-      [](Pool2DAttrs const &) {
-        return lift_task_id_t(task_id_t::POOL2D_INIT_TASK_ID);
-      },
-      [](ReduceAttrs const &) {
-        return lift_task_id_t(task_id_t::REDUCE_INIT_TASK_ID);
-      },
-      [](ReshapeAttrs const &) { return default_noop_task(); },
-      [](ReverseAttrs const &) { return default_noop_task(); },
-      [](SoftmaxAttrs const &) {
-        return lift_task_id_t(task_id_t::SOFTMAX_INIT_TASK_ID);
-      },
-      [](SplitAttrs const &) { return default_noop_task(); },
-      [](TopKAttrs const &) { return default_noop_task(); },
-      [](TransposeAttrs const &) { return default_noop_task(); },
-      [](WeightAttrs const &) { return default_noop_task(); },
-  });
-}
-
-task_id_with_noop_default_t
-    get_fwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) {
-
-  return op_attrs.visit<task_id_with_noop_default_t>(overload{
-      [](BatchMatmulAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHMATMUL_FWD_TASK_ID);
-      },
-      [](BatchNormAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHNORM_FWD_TASK_ID);
-      },
-      [](BroadcastAttrs const &) {
-        return lift_task_id_t(task_id_t::BROADCAST_FWD_TASK_ID);
-      },
-      [](CastAttrs const &) {
-        return lift_task_id_t(task_id_t::CAST_FWD_TASK_ID);
-      },
-      [](ConcatAttrs const &) {
-        return lift_task_id_t(task_id_t::CONCAT_FWD_TASK_ID);
-      },
-      [](Conv2DAttrs const &) {
-        return lift_task_id_t(task_id_t::CONV2D_FWD_TASK_ID);
-      },
-      [](DropoutAttrs const &) {
-        return lift_task_id_t(task_id_t::DROPOUT_FWD_TASK_ID);
-      },
-      [](ElementBinaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID);
-      },
-      [](ElementUnaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTUNARY_FWD_TASK_ID);
-      },
-      [](EmbeddingAttrs const &) {
-        return lift_task_id_t(task_id_t::EMBED_FWD_TASK_ID);
-      },
-      [](FlatAttrs const &) {
-        return lift_task_id_t(task_id_t::FLAT_FWD_TASK_ID);
-      },
-      [](GatherAttrs const &) {
-        return lift_task_id_t(task_id_t::GATHER_FWD_TASK_ID);
-      },
-      [](InputAttrs const &) { return default_noop_task(); },
-      [](LayerNormAttrs const &) {
-        return lift_task_id_t(task_id_t::LAYERNORM_FWD_TASK_ID);
-      },
-      [](LinearAttrs const &) {
-        return lift_task_id_t(task_id_t::LINEAR_FWD_TASK_ID);
-      },
-      [](MultiHeadAttentionAttrs const &) {
-        return lift_task_id_t(task_id_t::ATTENTION_FWD_TASK_ID);
-      },
-      [](NoopAttrs const &) { return default_noop_task(); },
-      [](Pool2DAttrs const &) {
-        return lift_task_id_t(task_id_t::POOL2D_FWD_TASK_ID);
-      },
-      [](ReduceAttrs const &) {
-        return lift_task_id_t(task_id_t::REDUCE_FWD_TASK_ID);
-      },
-      [](ReshapeAttrs const &) {
-        return lift_task_id_t(task_id_t::RESHAPE_FWD_TASK_ID);
-      },
-      [](ReverseAttrs const &) {
-        return lift_task_id_t(task_id_t::REVERSE_FWD_TASK_ID);
-      },
-      [](SoftmaxAttrs const &) {
-        return lift_task_id_t(task_id_t::SOFTMAX_FWD_TASK_ID);
-      },
-      [](SplitAttrs const &) {
-        return lift_task_id_t(task_id_t::SPLIT_FWD_TASK_ID);
-      },
-      [](TopKAttrs const &) {
-        return lift_task_id_t(task_id_t::TOPK_FWD_TASK_ID);
-      },
-      [](TransposeAttrs const &) {
-        return lift_task_id_t(task_id_t::TRANSPOSE_FWD_TASK_ID);
-      },
-      [](WeightAttrs const &) { return default_noop_task(); },
-  });
-}
-
-task_id_with_noop_default_t
-    get_bwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) {
-
-  return op_attrs.visit<task_id_with_noop_default_t>(overload{
-      [](BatchMatmulAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHMATMUL_BWD_TASK_ID);
-      },
-      [](BatchNormAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHNORM_BWD_TASK_ID);
-      },
-      [](BroadcastAttrs const &) {
-        return lift_task_id_t(task_id_t::BROADCAST_BWD_TASK_ID);
-      },
-      [](CastAttrs const &) {
-        return lift_task_id_t(task_id_t::CAST_BWD_TASK_ID);
-      },
-      [](ConcatAttrs const &) {
-        return lift_task_id_t(task_id_t::CONCAT_BWD_TASK_ID);
-      },
-      [](Conv2DAttrs const &) {
-        return lift_task_id_t(task_id_t::CONV2D_BWD_TASK_ID);
-      },
-      [](DropoutAttrs const &) {
-        return lift_task_id_t(task_id_t::DROPOUT_BWD_TASK_ID);
-      },
-      [](ElementBinaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID);
-      },
-      [](ElementUnaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTUNARY_BWD_TASK_ID);
-      },
-      [](EmbeddingAttrs const &) {
-        return lift_task_id_t(task_id_t::EMBED_BWD_TASK_ID);
-      },
-      [](FlatAttrs const &) {
-        return lift_task_id_t(task_id_t::FLAT_BWD_TASK_ID);
-      },
-      [](GatherAttrs const &) {
-        return lift_task_id_t(task_id_t::GATHER_BWD_TASK_ID);
-      },
-      [](InputAttrs const &) { return default_noop_task(); },
-      [](LayerNormAttrs const &) {
-        return lift_task_id_t(task_id_t::LAYERNORM_BWD_TASK_ID);
-      },
-      [](LinearAttrs const &) {
-        return lift_task_id_t(task_id_t::LINEAR_BWD_TASK_ID);
-      },
-      [](MultiHeadAttentionAttrs const &) {
-        return lift_task_id_t(task_id_t::ATTENTION_BWD_TASK_ID);
-      },
-      [](NoopAttrs const &) { return default_noop_task(); },
-      [](Pool2DAttrs const &) {
-        return lift_task_id_t(task_id_t::POOL2D_BWD_TASK_ID);
-      },
-      [](ReduceAttrs const &) {
-        return lift_task_id_t(task_id_t::REDUCE_BWD_TASK_ID);
-      },
-      [](ReshapeAttrs const &) {
-        return lift_task_id_t(task_id_t::RESHAPE_BWD_TASK_ID);
-      },
-      [](ReverseAttrs const &) {
-        return lift_task_id_t(task_id_t::REVERSE_BWD_TASK_ID);
-      },
-      [](SoftmaxAttrs const &) {
-        return lift_task_id_t(task_id_t::SOFTMAX_BWD_TASK_ID);
-      },
-      [](SplitAttrs const &) {
-        return lift_task_id_t(task_id_t::SPLIT_BWD_TASK_ID);
-      },
-      [](TopKAttrs const &) {
-        return lift_task_id_t(task_id_t::TOPK_BWD_TASK_ID);
-      },
-      [](TransposeAttrs const &) {
-        return lift_task_id_t(task_id_t::TRANSPOSE_BWD_TASK_ID);
-      },
-      [](WeightAttrs const &) { return default_noop_task(); },
-  });
-}
-
-} // namespace FlexFlow

From 32c66578a2dc23249feaac5e9845f15fb4740788 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 14:34:22 -0800
Subject: [PATCH 013/113] Avoid pulling in the entire invocation.

---
 .../include/realm-execution/task_id_t.h                |  4 ++--
 lib/realm-execution/src/realm-execution/task_id_t.cc   | 10 ++++------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/task_id_t.h b/lib/realm-execution/include/realm-execution/task_id_t.h
index af20dc27f6..38b82ad9e0 100644
--- a/lib/realm-execution/include/realm-execution/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/task_id_t.h
@@ -4,13 +4,13 @@
 #include "op-attrs/pcg_operator_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/task_id_t.dtg.h"
-#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
 #include <optional>
 
 namespace FlexFlow {
 
 std::optional<task_id_t>
-    get_task_id_for_op(DynamicNodeInvocation const &,
+    get_task_id_for_op(DynamicNodeAttrs const &,
                        std::optional<OptimizerAttrs> const &);
 
 std::optional<task_id_t>
diff --git a/lib/realm-execution/src/realm-execution/task_id_t.cc b/lib/realm-execution/src/realm-execution/task_id_t.cc
index 94b5fb5b24..574dbb1e54 100644
--- a/lib/realm-execution/src/realm-execution/task_id_t.cc
+++ b/lib/realm-execution/src/realm-execution/task_id_t.cc
@@ -8,16 +8,14 @@
 namespace FlexFlow {
 
 std::optional<task_id_t>
-    get_task_id_for_op(DynamicNodeInvocation const &invocation,
+    get_task_id_for_op(DynamicNodeAttrs const &node_attrs,
                        std::optional<OptimizerAttrs> const &optimizer_attrs) {
-  DynamicTaskType task_type = invocation.node_attrs.task_type.value();
+  DynamicTaskType task_type = node_attrs.task_type.value();
   switch (task_type) {
     case DynamicTaskType::FWD:
-      return get_fwd_task_id_for_op_attrs(
-          invocation.node_attrs.op_attrs.value());
+      return get_fwd_task_id_for_op_attrs(node_attrs.op_attrs.value());
     case DynamicTaskType::BWD:
-      return get_bwd_task_id_for_op_attrs(
-          invocation.node_attrs.op_attrs.value());
+      return get_bwd_task_id_for_op_attrs(node_attrs.op_attrs.value());
     case DynamicTaskType::UPD:
       return get_update_task_id_for_optimizer_attrs(optimizer_attrs.value());
     case DynamicTaskType::LOSS:

From 40424c77058a55a9dc084c62c6b39565b166adcb Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 14:45:46 -0800
Subject: [PATCH 014/113] Conversion into Realm task IDs.

---
 .../include/realm-execution/realm_task_id_t.h       | 13 +++++++++++++
 .../src/realm-execution/realm_manager.cc            |  5 ++++-
 .../src/realm-execution/realm_task_id_t.cc          | 10 ++++++++++
 3 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 lib/realm-execution/include/realm-execution/realm_task_id_t.h
 create mode 100644 lib/realm-execution/src/realm-execution/realm_task_id_t.cc

diff --git a/lib/realm-execution/include/realm-execution/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
new file mode 100644
index 0000000000..6d2e316b14
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
+
+#include "realm-execution/task_id_t.dtg.h"
+#include "realm.h"
+
+namespace FlexFlow {
+
+Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 0ccf3f4116..747f603f5d 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -1,4 +1,6 @@
 #include "realm-execution/realm_manager.h"
+#include "realm-execution/realm_task_id_t.h"
+#include "realm-execution/task_id_t.dtg.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -27,7 +29,8 @@ RealmManager::~RealmManager() {
 
 Realm::Event
     RealmManager::start_controller(std::function<void(RealmManager &)> thunk) {
-  constexpr int CONTROLLER_TASK_ID = Realm::Processor::TASK_ID_FIRST_AVAILABLE;
+  Realm::Processor::TaskFuncID CONTROLLER_TASK_ID =
+      get_realm_task_id_for_task_id(task_id_t::CONTROLLER_TASK_ID);
   Realm::Event task_ready = Realm::Processor::register_task_by_kind(
       Realm::Processor::LOC_PROC,
       /*global=*/false,
diff --git a/lib/realm-execution/src/realm-execution/realm_task_id_t.cc b/lib/realm-execution/src/realm-execution/realm_task_id_t.cc
new file mode 100644
index 0000000000..50b23dfe86
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_task_id_t.cc
@@ -0,0 +1,10 @@
+#include "realm-execution/realm_task_id_t.h"
+
+namespace FlexFlow {
+
+Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t task_id) {
+  return Realm::Processor::TASK_ID_FIRST_AVAILABLE +
+         static_cast<Realm::Processor::TaskFuncID>(task_id);
+}
+
+} // namespace FlexFlow

From 364eb5f3349365890187e24eef93a0f4e980b10a Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 15:49:20 -0800
Subject: [PATCH 015/113] Add a top-level PRealm switch.

---
 .../include/realm-execution/realm.h           | 20 +++++++++++++++++++
 .../include/realm-execution/realm_manager.h   |  2 +-
 .../include/realm-execution/realm_task_id_t.h |  2 +-
 .../src/realm-execution/task_id_t.cc          |  1 -
 .../test/src/realm-execution/realm_manager.cc |  2 +-
 .../test/src/realm-execution/test_e2e.cc      |  3 ++-
 6 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/realm.h

diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h
new file mode 100644
index 0000000000..f15113ee92
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm.h
@@ -0,0 +1,20 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_H
+
+#ifdef FLEXFLOW_USE_PREALM
+#include <realm/prealm/prealm.h>
+#else
+#include <realm.h>
+#endif
+
+namespace FlexFlow {
+
+#ifdef FLEXFLOW_USE_PREALM
+namespace Realm = ::PRealm;
+#else
+namespace Realm = ::Realm;
+#endif
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 88cc11f744..b26adea548 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -4,7 +4,7 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "pcg/device_id_t.dtg.h"
-#include "realm.h"
+#include "realm-execution/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
index 6d2e316b14..8e6da1a2bd 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 
+#include "realm-execution/realm.h"
 #include "realm-execution/task_id_t.dtg.h"
-#include "realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/src/realm-execution/task_id_t.cc b/lib/realm-execution/src/realm-execution/task_id_t.cc
index 574dbb1e54..3521f50c02 100644
--- a/lib/realm-execution/src/realm-execution/task_id_t.cc
+++ b/lib/realm-execution/src/realm-execution/task_id_t.cc
@@ -2,7 +2,6 @@
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
-#include "task-spec/dynamic_graph/dynamic_task_type.dtg.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 16b5338881..f9fbd986c2 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     // Launch a controller
     int some_data = 123;
-    Realm::Event event = manager.start_controller(
+    FlexFlow::Realm::Event event = manager.start_controller(
         [&](RealmManager &manager) { ASSERT(some_data == 123); });
     // Need to block on the completion of the event to ensure we don't race
     event.wait();
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 623b8318e6..fa9f798e4f 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -11,7 +11,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
     RealmManager manager(&fake_argc, &fake_argv);
-    Realm::Event event = manager.start_controller([](RealmManager &manager) {});
+    FlexFlow::Realm::Event event =
+        manager.start_controller([](RealmManager &manager) {});
     event.wait();
   }
 }

From f8ac308e52ab193ed94f3d973fa5da0f6ab7ccde Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 5 Feb 2026 17:08:38 -0800
Subject: [PATCH 016/113] Some work on Realm task registry.

---
 .../realm-execution/realm_task_registry.h     | 13 +++++
 .../realm-execution/realm_task_registry.cc    | 55 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 lib/realm-execution/include/realm-execution/realm_task_registry.h
 create mode 100644 lib/realm-execution/src/realm-execution/realm_task_registry.cc

diff --git a/lib/realm-execution/include/realm-execution/realm_task_registry.h b/lib/realm-execution/include/realm-execution/realm_task_registry.h
new file mode 100644
index 0000000000..3a4cee106c
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_task_registry.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+Realm::Event register_all_tasks();
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/realm_task_registry.cc
new file mode 100644
index 0000000000..a5e52b7a7c
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_task_registry.cc
@@ -0,0 +1,55 @@
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_task_id_t.h"
+#include "realm-execution/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+void op_task_wrapper(
+    void const *, size_t, void const *, size_t, Realm::Processor) {}
+
+static Realm::Event register_task(Realm::Processor::Kind target_kind,
+                                  task_id_t func_id,
+                                  void (*task_body)(void const *,
+                                                    size_t,
+                                                    void const *,
+                                                    size_t,
+                                                    Realm::Processor)) {
+  return Realm::Processor::register_task_by_kind(
+      target_kind,
+      /*global=*/false,
+      get_realm_task_id_for_task_id(func_id),
+      Realm::CodeDescriptor(task_body),
+      Realm::ProfilingRequestSet());
+}
+
+Realm::Event register_all_tasks() {
+  std::vector<Realm::Event> pending_registrations;
+
+  std::vector<task_id_t> init_task_ids = {
+      task_id_t::BATCHNORM_INIT_TASK_ID,
+      task_id_t::COMBINE_INIT_TASK_ID,
+      task_id_t::CONV2D_INIT_TASK_ID,
+      task_id_t::DROPOUT_INIT_TASK_ID,
+      task_id_t::ELEMENTBINARY_INIT_TASK_ID,
+      task_id_t::ELEMENTUNARY_INIT_TASK_ID,
+      task_id_t::GATHER_INIT_TASK_ID,
+      task_id_t::LAYERNORM_INIT_TASK_ID,
+      task_id_t::LINEAR_INIT_TASK_ID,
+      task_id_t::ATTENTION_INIT_TASK_ID,
+      task_id_t::POOL2D_INIT_TASK_ID,
+      task_id_t::REDUCE_INIT_TASK_ID,
+      task_id_t::REDUCTION_INIT_TASK_ID,
+      task_id_t::REPARTITION_INIT_TASK_ID,
+      task_id_t::REPLICATE_INIT_TASK_ID,
+      task_id_t::SOFTMAX_INIT_TASK_ID,
+  };
+
+  for (task_id_t init_task_id : init_task_ids) {
+    pending_registrations.push_back(register_task(
+        Realm::Processor::LOC_PROC, init_task_id, op_task_wrapper));
+  }
+
+  return Realm::Event::merge_events(pending_registrations);
+}
+
+} // namespace FlexFlow

From f79cd85d9386ddf09f161630812c601d4d85a625 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 09:20:53 -0800
Subject: [PATCH 017/113] Split out the Realm context.

---
 .../parallel_computation_graph_instance.h     |  8 +--
 .../include/realm-execution/realm_context.h   | 34 +++++++++++
 .../include/realm-execution/realm_manager.h   | 25 ++------
 .../parallel_computation_graph_instance.cc    |  4 +-
 .../src/realm-execution/realm_context.cc      | 34 +++++++++++
 .../src/realm-execution/realm_manager.cc      | 60 +++++--------------
 .../test/src/realm-execution/realm_manager.cc |  2 +-
 .../test/src/realm-execution/test_e2e.cc      |  4 +-
 8 files changed, 96 insertions(+), 75 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/realm_context.h
 create mode 100644 lib/realm-execution/src/realm-execution/realm_context.cc

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index 4ba77a7925..0dd87d566f 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -9,7 +9,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
-#include "realm-execution/realm_manager.h"
+#include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
@@ -21,7 +21,7 @@ namespace FlexFlow {
 
 struct ParallelComputationGraphInstance {
 public:
-  ParallelComputationGraphInstance(RealmManager &,
+  ParallelComputationGraphInstance(RealmContext &,
                                    DynamicOpenDataflowGraph,
                                    std::vector<DynamicNodeInvocation> const &,
                                    OptimizerAttrs const &,
@@ -36,7 +36,7 @@ struct ParallelComputationGraphInstance {
   std::optional<GenericTensorAccessorR> get_loss_tensor_accessor() const;
 
 private:
-  RealmManager &realm;
+  RealmContext &realm;
   DynamicOpenDataflowGraph dataflow_graph;
   std::vector<DynamicNodeInvocation> topological_ordering;
   OptimizerAttrs optimizer_attrs;
@@ -45,7 +45,7 @@ struct ParallelComputationGraphInstance {
 };
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
-    RealmManager &realm,
+    RealmContext &realm,
     ParallelComputationGraph const &pcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
new file mode 100644
index 0000000000..5539fe693e
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -0,0 +1,34 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H
+
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "realm-execution/realm.h"
+
+namespace FlexFlow {
+
+struct RealmContext {
+public:
+  RealmContext();
+  virtual ~RealmContext();
+
+  RealmContext(RealmContext const &) = delete;
+  RealmContext(RealmContext &&) = delete;
+
+  // Current device context
+  Allocator &get_current_device_allocator() const;
+  device_handle_t const &get_current_device_handle() const;
+  device_id_t const &get_current_device_idx() const;
+
+protected:
+  [[nodiscard]] Realm::Event merge_outstanding_events();
+
+protected:
+  Realm::Runtime runtime;
+  std::vector<Realm::Event> outstanding_events;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index b26adea548..bf5e8f72f1 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -5,38 +5,21 @@
 #include "kernels/device_handle_t.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
 
 namespace FlexFlow {
 
-struct RealmManager {
+struct RealmManager : private RealmContext {
 public:
   RealmManager(int *argc, char ***argv);
-  ~RealmManager();
+  virtual ~RealmManager();
 
   RealmManager() = delete;
   RealmManager(RealmManager const &) = delete;
   RealmManager(RealmManager &&) = delete;
 
   [[nodiscard]] Realm::Event
-      start_controller(std::function<void(RealmManager &)>);
-
-  // Current device context
-  Allocator &get_current_device_allocator() const;
-  device_handle_t const &get_current_device_handle() const;
-  device_id_t const &get_current_device_idx() const;
-
-private:
-  RealmManager(void const *, size_t, void const *, size_t, Realm::Processor);
-
-  [[nodiscard]] Realm::Event merge_outstanding_events();
-
-  static void controller_task_wrapper(
-      void const *, size_t, void const *, size_t, Realm::Processor);
-
-private:
-  Realm::Runtime runtime;
-  std::vector<Realm::Event> outstanding_events;
-  bool is_root_runtime;
+      start_controller(std::function<void(RealmContext &)>);
 };
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 64c9da2f4c..c8100287f8 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -12,7 +12,7 @@
 namespace FlexFlow {
 
 ParallelComputationGraphInstance::ParallelComputationGraphInstance(
-    RealmManager &realm,
+    RealmContext &realm,
     DynamicOpenDataflowGraph dataflow_graph,
     std::vector<DynamicNodeInvocation> const &topological_ordering,
     OptimizerAttrs const &optimizer_attrs,
@@ -61,7 +61,7 @@ static GenericTensorAccessorW
 }
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
-    RealmManager &realm,
+    RealmContext &realm,
     ParallelComputationGraph const &pcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
new file mode 100644
index 0000000000..5068373ebe
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -0,0 +1,34 @@
+#include "realm-execution/realm_context.h"
+#include "realm-execution/realm_task_id_t.h"
+#include "realm-execution/task_id_t.dtg.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+RealmContext::RealmContext() {}
+
+RealmContext::~RealmContext() {
+  if (!this->outstanding_events.empty()) {
+    Realm::Event outstanding = this->merge_outstanding_events();
+    outstanding.wait();
+  }
+}
+
+Allocator &RealmContext::get_current_device_allocator() const {
+  NOT_IMPLEMENTED();
+}
+
+device_handle_t const &RealmContext::get_current_device_handle() const {
+  NOT_IMPLEMENTED();
+}
+device_id_t const &RealmContext::get_current_device_idx() const {
+  NOT_IMPLEMENTED();
+}
+
+Realm::Event RealmContext::merge_outstanding_events() {
+  Realm::Event result = Realm::Event::merge_events(this->outstanding_events);
+  this->outstanding_events.clear();
+  return result;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 747f603f5d..501ba7536a 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -5,37 +5,39 @@
 
 namespace FlexFlow {
 
-RealmManager::RealmManager(int *argc, char ***argv) : is_root_runtime(true) {
+RealmManager::RealmManager(int *argc, char ***argv) {
   bool ok = this->runtime.init(argc, argv);
   ASSERT(ok);
 }
 
-RealmManager::RealmManager(void const *args,
-                           size_t arglen,
-                           void const *userdata,
-                           size_t userdatalen,
-                           Realm::Processor proc)
-    : runtime(Realm::Runtime::get_runtime()), is_root_runtime(false) {}
-
 RealmManager::~RealmManager() {
   Realm::Event outstanding = this->merge_outstanding_events();
-  if (is_root_runtime) {
     this->runtime.shutdown(outstanding);
     this->runtime.wait_for_shutdown();
-  } else {
-    outstanding.wait();
-  }
+}
+
+static void controller_task_wrapper(void const *args,
+                                           size_t arglen,
+                                           void const *userdata,
+                                           size_t userlen,
+                                           Realm::Processor proc) {
+  ASSERT(arglen == sizeof(std::function<void(RealmContext &)>));
+  std::function<void(RealmContext &)> thunk =
+      *reinterpret_cast<std::function<void(RealmContext &)> const *>(args);
+
+  RealmContext ctx;
+  thunk(ctx);
 }
 
 Realm::Event
-    RealmManager::start_controller(std::function<void(RealmManager &)> thunk) {
+    RealmManager::start_controller(std::function<void(RealmContext &)> thunk) {
   Realm::Processor::TaskFuncID CONTROLLER_TASK_ID =
       get_realm_task_id_for_task_id(task_id_t::CONTROLLER_TASK_ID);
   Realm::Event task_ready = Realm::Processor::register_task_by_kind(
       Realm::Processor::LOC_PROC,
       /*global=*/false,
       CONTROLLER_TASK_ID,
-      Realm::CodeDescriptor(RealmManager::controller_task_wrapper),
+      Realm::CodeDescriptor(controller_task_wrapper),
       Realm::ProfilingRequestSet(),
       &thunk,
       sizeof(thunk));
@@ -51,34 +53,4 @@ Realm::Event
   return task_complete;
 }
 
-Allocator &RealmManager::get_current_device_allocator() const {
-  NOT_IMPLEMENTED();
-}
-
-device_handle_t const &RealmManager::get_current_device_handle() const {
-  NOT_IMPLEMENTED();
-}
-device_id_t const &RealmManager::get_current_device_idx() const {
-  NOT_IMPLEMENTED();
-}
-
-Realm::Event RealmManager::merge_outstanding_events() {
-  Realm::Event result = Realm::Event::merge_events(this->outstanding_events);
-  this->outstanding_events.clear();
-  return result;
-}
-
-void RealmManager::controller_task_wrapper(void const *args,
-                                           size_t arglen,
-                                           void const *userdata,
-                                           size_t userlen,
-                                           Realm::Processor proc) {
-  ASSERT(arglen == sizeof(std::function<void(RealmManager &)>));
-  std::function<void(RealmManager &)> thunk =
-      *reinterpret_cast<std::function<void(RealmManager &)> const *>(args);
-
-  RealmManager manager(args, arglen, userdata, userlen, proc);
-  thunk(manager);
-}
-
 } // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index f9fbd986c2..6c28a001ad 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     // Launch a controller
     int some_data = 123;
     FlexFlow::Realm::Event event = manager.start_controller(
-        [&](RealmManager &manager) { ASSERT(some_data == 123); });
+        [&](RealmContext &ctx) { ASSERT(some_data == 123); });
     // Need to block on the completion of the event to ensure we don't race
     event.wait();
   }
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index fa9f798e4f..a30d5c4d8e 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -11,8 +11,6 @@ TEST_SUITE(FF_TEST_SUITE) {
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
     RealmManager manager(&fake_argc, &fake_argv);
-    FlexFlow::Realm::Event event =
-        manager.start_controller([](RealmManager &manager) {});
-    event.wait();
+    (void)manager.start_controller([](RealmContext &ctx) {});
   }
 }

From 0cef52ac5d3e5ee003a794ccb54be43acabb747b Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 09:32:02 -0800
Subject: [PATCH 018/113] Switch to mapped PCG.

---
 .../parallel_computation_graph_instance.h     |  4 ++--
 .../parallel_computation_graph_instance.cc    |  7 ++++---
 .../src/realm-execution/realm_manager.cc      | 12 ++++++------
 ...ke_dynamic_open_dataflow_graph_from_mpcg.h | 14 ++++++++++++++
 ...ake_dynamic_open_dataflow_graph_from_pcg.h | 14 --------------
 ..._dynamic_open_dataflow_graph_from_mpcg.cc} | 19 ++++++++++---------
 6 files changed, 36 insertions(+), 34 deletions(-)
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
 delete mode 100644 lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h
 rename lib/task-spec/src/task-spec/dynamic_graph/{make_dynamic_open_dataflow_graph_from_pcg.cc => make_dynamic_open_dataflow_graph_from_mpcg.cc} (84%)

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index 0dd87d566f..06c2d2d912 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -7,8 +7,8 @@
 #include "kernels/profiling_settings.dtg.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/device_id_t.dtg.h"
+#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
@@ -46,7 +46,7 @@ struct ParallelComputationGraphInstance {
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     RealmContext &realm,
-    ParallelComputationGraph const &pcg,
+    MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorR> label_tensor,
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index c8100287f8..e7bf79f12d 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -4,7 +4,7 @@
 #include "pcg/optimizer_attrs.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
-#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h"
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
 #include "task-spec/dynamic_graph/pass_expansion.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
 #include "utils/exception.h"
@@ -62,7 +62,7 @@ static GenericTensorAccessorW
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     RealmContext &realm,
-    ParallelComputationGraph const &pcg,
+    MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorR> label_tensor,
@@ -72,7 +72,8 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     ProfilingSettings const &profiling_settings,
     FFIterationConfig const &iteration_config) {
 
-  DynamicOpenDataflowGraph dg = make_dynamic_open_dataflow_graph_from_pcg(pcg);
+  DynamicOpenDataflowGraph dg =
+      make_dynamic_open_dataflow_graph_from_mpcg(mpcg);
   dg = perform_pass_expansion(dg);
 
   std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> inputs =
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 501ba7536a..0c34d77204 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -12,15 +12,15 @@ RealmManager::RealmManager(int *argc, char ***argv) {
 
 RealmManager::~RealmManager() {
   Realm::Event outstanding = this->merge_outstanding_events();
-    this->runtime.shutdown(outstanding);
-    this->runtime.wait_for_shutdown();
+  this->runtime.shutdown(outstanding);
+  this->runtime.wait_for_shutdown();
 }
 
 static void controller_task_wrapper(void const *args,
-                                           size_t arglen,
-                                           void const *userdata,
-                                           size_t userlen,
-                                           Realm::Processor proc) {
+                                    size_t arglen,
+                                    void const *userdata,
+                                    size_t userlen,
+                                    Realm::Processor proc) {
   ASSERT(arglen == sizeof(std::function<void(RealmContext &)>));
   std::function<void(RealmContext &)> thunk =
       *reinterpret_cast<std::function<void(RealmContext &)> const *>(args);
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
new file mode 100644
index 0000000000..758a0c2813
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_MPCG_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_MPCG_H
+
+#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+
+namespace FlexFlow {
+
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
+    MappedParallelComputationGraph const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h
deleted file mode 100644
index a71eb558c1..0000000000
--- a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_PCG_H
-#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_PCG_H
-
-#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
-#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
-
-namespace FlexFlow {
-
-DynamicOpenDataflowGraph
-    make_dynamic_open_dataflow_graph_from_pcg(ParallelComputationGraph const &);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
similarity index 84%
rename from lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc
rename to lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
index 841be27dfd..e90ef10398 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
@@ -1,4 +1,4 @@
-#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_pcg.h"
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "op-attrs/pcg_operator_attrs.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
@@ -13,26 +13,27 @@
 
 namespace FlexFlow {
 
-DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_pcg(
-    ParallelComputationGraph const &pcg) {
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
+    MappedParallelComputationGraph const &mpcg) {
   DynamicOpenDataflowGraph result = make_empty_dynamic_open_dataflow_graph();
 
-  for (auto const &[layer, attrs] : get_parallel_layer_attrs_mapping(pcg)) {
+  for (auto const &[layer, attrs] :
+       get_parallel_layer_attrs_mapping(mpcg.pcg)) {
     DynamicNodeAttrs result_attrs{
         /*task_type=*/std::nullopt,
         /*device_coord=*/std::nullopt,
-        /*mapping=*/std::nullopt,
+        /*mapping=*/mpcg.mapped_tasks.at(layer),
         /*op_attrs=*/attrs.op_attrs,
         /*pcg_layer_guid=*/dynamic_layer_guid_t{layer},
         /*per_device_op_state=*/std::nullopt,
     };
 
     std::unordered_map<DynamicTensorSlot, DynamicValueAttrs> result_inputs =
-        transform(get_incoming_tensors(pcg, layer),
+        transform(get_incoming_tensors(mpcg.pcg, layer),
                   [&](TensorSlotName const &slot_name,
                       parallel_tensor_guid_t const &tensor) {
                     ParallelTensorAttrs attrs =
-                        get_parallel_tensor_attrs(pcg, tensor);
+                        get_parallel_tensor_attrs(mpcg.pcg, tensor);
                     return std::pair<DynamicTensorSlot, DynamicValueAttrs>{
                         DynamicTensorSlot{
                             /*slot_name=*/slot_name,
@@ -48,11 +49,11 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_pcg(
                     };
                   });
     std::unordered_map<DynamicTensorSlot, DynamicValueAttrs> result_outputs =
-        transform(get_outgoing_tensors(pcg, layer),
+        transform(get_outgoing_tensors(mpcg.pcg, layer),
                   [&](TensorSlotName const &slot_name,
                       parallel_tensor_guid_t const &tensor) {
                     ParallelTensorAttrs attrs =
-                        get_parallel_tensor_attrs(pcg, tensor);
+                        get_parallel_tensor_attrs(mpcg.pcg, tensor);
                     return std::pair<DynamicTensorSlot, DynamicValueAttrs>{
                         DynamicTensorSlot{
                             /*slot_name=*/slot_name,

From 40b5f34bf8139a29e6cdc93d542f3ae6f2812783 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 09:54:57 -0800
Subject: [PATCH 019/113] Implement shard expansion pass and run it in PCG
 instance creation.

---
 .../parallel_computation_graph_instance.h         |  3 ++-
 .../parallel_computation_graph_instance.cc        |  9 ++++++---
 .../task-spec/dynamic_graph/shard_expansion.cc    | 15 +++++++++++++++
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index 06c2d2d912..f361cec3ca 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -9,6 +9,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
@@ -50,7 +51,7 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorR> label_tensor,
-    std::optional<dynamic_tensor_guid_t> logit_tensor,
+    std::optional<parallel_tensor_guid_t> logit_tensor,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index e7bf79f12d..80ed98f8c2 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -3,9 +3,11 @@
 #include "local-execution/tensor_allocation.h"
 #include "pcg/optimizer_attrs.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
 #include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
 #include "task-spec/dynamic_graph/pass_expansion.h"
+#include "task-spec/dynamic_graph/shard_expansion.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
 #include "utils/exception.h"
 
@@ -66,7 +68,7 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorR> label_tensor,
-    std::optional<dynamic_tensor_guid_t> logit_tensor,
+    std::optional<parallel_tensor_guid_t> logit_tensor,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
@@ -81,13 +83,14 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   std::optional<DynamicValueAttrs> logit_grad_value;
   if (loss_attrs) {
     auto [dg2, label_v, logit_grad_v] = perform_loss_insertion(
-        dg, assert_unwrap(loss_attrs), assert_unwrap(logit_tensor));
+        dg, loss_attrs.value(), dynamic_tensor_guid_t{logit_tensor.value()});
     dg = dg2;
     logit_grad_value = logit_grad_v;
-    inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
+    inputs.insert(std::pair{label_v, label_tensor.value()});
   }
 
   dg = perform_update_insertion(dg, optimizer_attrs);
+  dg = perform_shard_expansion(dg);
   dg = perform_tensor_allocation(
       dg, inputs, realm.get_current_device_allocator());
 
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
index ea253b63f8..33b7fb8591 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
@@ -81,4 +81,19 @@ std::unordered_set<DynamicNodeInvocation>
       });
 }
 
+DynamicOpenDataflowGraph
+    perform_shard_expansion(DynamicOpenDataflowGraph const &g) {
+
+  ASSERT(no_part_of_graph_is_shard_expanded(g));
+
+  DynamicOpenDataflowGraph result =
+      flatmap_dynamic_invocation_set(g, [&](DynamicNodeInvocation const &i) {
+        return perform_shard_expansion_for_invocation(i);
+      });
+
+  ASSERT(graph_is_fully_shard_expanded(result));
+
+  return result;
+}
+
 } // namespace FlexFlow

From 048079fdcb13cf3ca4b2f981c9a094b9fd414e73 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 10:47:43 -0800
Subject: [PATCH 020/113] Add instance field to dynamic graph, more task IDs.

---
 .../include/realm-execution/realm_context.h   |  2 +-
 .../include/realm-execution/realm_manager.h   |  2 +-
 .../include/realm-execution/realm_task_id_t.h |  2 +-
 .../realm-execution/realm_task_registry.h     |  4 +-
 .../realm-execution/realm_task_registry.cc    | 81 +++++++++++++++++--
 lib/task-spec/CMakeLists.txt                  |  1 +
 .../dynamic_value_attrs.dtg.toml              |  6 ++
 .../include/task-spec/realm/fmt/instance.h    | 35 ++++++++
 .../include/task-spec/realm}/realm.h          |  4 +-
 .../task-spec/dynamic_graph/loss_insertion.cc |  2 +
 ...ake_dynamic_open_dataflow_graph_from_cg.cc |  2 +
 ...e_dynamic_open_dataflow_graph_from_mpcg.cc |  2 +
 .../dynamic_graph/update_insertion.cc         |  1 +
 .../src/task-spec/realm/fmt/instance.h        | 10 +++
 .../dynamic_open_dataflow_graph.cc            |  3 +
 .../dynamic_graph/machine_slicing.cc          |  1 +
 .../task-spec/dynamic_graph/pass_expansion.cc |  3 +
 .../dynamic_graph/shard_expansion.cc          |  1 +
 18 files changed, 148 insertions(+), 14 deletions(-)
 create mode 100644 lib/task-spec/include/task-spec/realm/fmt/instance.h
 rename lib/{realm-execution/include/realm-execution => task-spec/include/task-spec/realm}/realm.h (63%)
 create mode 100644 lib/task-spec/src/task-spec/realm/fmt/instance.h

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 5539fe693e..357b05b699 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -4,7 +4,7 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "pcg/device_id_t.dtg.h"
-#include "realm-execution/realm.h"
+#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index bf5e8f72f1..ebf3bb401e 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -4,8 +4,8 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "pcg/device_id_t.dtg.h"
-#include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
+#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
index 8e6da1a2bd..327cf9ffd0 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 
-#include "realm-execution/realm.h"
 #include "realm-execution/task_id_t.dtg.h"
+#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_registry.h b/lib/realm-execution/include/realm-execution/realm_task_registry.h
index 3a4cee106c..d9d993795b 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/realm_task_registry.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
 
-#include "realm-execution/realm.h"
 #include "realm-execution/task_id_t.dtg.h"
+#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
-Realm::Event register_all_tasks();
+[[nodiscard]] Realm::Event register_all_tasks();
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/src/realm-execution/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/realm_task_registry.cc
index a5e52b7a7c..5c61c208fb 100644
--- a/lib/realm-execution/src/realm-execution/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/realm_task_registry.cc
@@ -1,11 +1,13 @@
-#include "realm-execution/realm.h"
+#include "realm-execution/realm_task_registry.h"
 #include "realm-execution/realm_task_id_t.h"
-#include "realm-execution/task_id_t.dtg.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-void op_task_wrapper(
-    void const *, size_t, void const *, size_t, Realm::Processor) {}
+static void operation_task_wrapper(
+    void const *, size_t, void const *, size_t, Realm::Processor) {
+  NOT_IMPLEMENTED();
+}
 
 static Realm::Event register_task(Realm::Processor::Kind target_kind,
                                   task_id_t func_id,
@@ -25,7 +27,8 @@ static Realm::Event register_task(Realm::Processor::Kind target_kind,
 Realm::Event register_all_tasks() {
   std::vector<Realm::Event> pending_registrations;
 
-  std::vector<task_id_t> init_task_ids = {
+  std::vector<task_id_t> task_ids = {
+      // Init tasks
       task_id_t::BATCHNORM_INIT_TASK_ID,
       task_id_t::COMBINE_INIT_TASK_ID,
       task_id_t::CONV2D_INIT_TASK_ID,
@@ -42,11 +45,75 @@ Realm::Event register_all_tasks() {
       task_id_t::REPARTITION_INIT_TASK_ID,
       task_id_t::REPLICATE_INIT_TASK_ID,
       task_id_t::SOFTMAX_INIT_TASK_ID,
+
+      // Forward tasks
+      task_id_t::BATCHMATMUL_FWD_TASK_ID,
+      task_id_t::BATCHNORM_FWD_TASK_ID,
+      task_id_t::BROADCAST_FWD_TASK_ID,
+      task_id_t::CAST_FWD_TASK_ID,
+      task_id_t::COMBINE_FWD_TASK_ID,
+      task_id_t::CONCAT_FWD_TASK_ID,
+      task_id_t::CONV2D_FWD_TASK_ID,
+      task_id_t::DROPOUT_FWD_TASK_ID,
+      task_id_t::ELEMENTBINARY_FWD_TASK_ID,
+      task_id_t::ELEMENTUNARY_FWD_TASK_ID,
+      task_id_t::EMBED_FWD_TASK_ID,
+      task_id_t::FLAT_FWD_TASK_ID,
+      task_id_t::GATHER_FWD_TASK_ID,
+      task_id_t::LAYERNORM_FWD_TASK_ID,
+      task_id_t::LINEAR_FWD_TASK_ID,
+      task_id_t::ATTENTION_FWD_TASK_ID,
+      task_id_t::POOL2D_FWD_TASK_ID,
+      task_id_t::REDUCE_FWD_TASK_ID,
+      task_id_t::REDUCTION_FWD_TASK_ID,
+      task_id_t::REPARTITION_FWD_TASK_ID,
+      task_id_t::REPLICATE_FWD_TASK_ID,
+      task_id_t::RESHAPE_FWD_TASK_ID,
+      task_id_t::REVERSE_FWD_TASK_ID,
+      task_id_t::SOFTMAX_FWD_TASK_ID,
+      task_id_t::SPLIT_FWD_TASK_ID,
+      task_id_t::TOPK_FWD_TASK_ID,
+      task_id_t::TRANSPOSE_FWD_TASK_ID,
+
+      // Backward tasks
+      task_id_t::BATCHMATMUL_BWD_TASK_ID,
+      task_id_t::BATCHNORM_BWD_TASK_ID,
+      task_id_t::BROADCAST_BWD_TASK_ID,
+      task_id_t::CAST_BWD_TASK_ID,
+      task_id_t::COMBINE_BWD_TASK_ID,
+      task_id_t::CONCAT_BWD_TASK_ID,
+      task_id_t::CONV2D_BWD_TASK_ID,
+      task_id_t::DROPOUT_BWD_TASK_ID,
+      task_id_t::ELEMENTBINARY_BWD_TASK_ID,
+      task_id_t::ELEMENTUNARY_BWD_TASK_ID,
+      task_id_t::EMBED_BWD_TASK_ID,
+      task_id_t::FLAT_BWD_TASK_ID,
+      task_id_t::GATHER_BWD_TASK_ID,
+      task_id_t::LAYERNORM_BWD_TASK_ID,
+      task_id_t::LINEAR_BWD_TASK_ID,
+      task_id_t::ATTENTION_BWD_TASK_ID,
+      task_id_t::POOL2D_BWD_TASK_ID,
+      task_id_t::REDUCE_BWD_TASK_ID,
+      task_id_t::REDUCTION_BWD_TASK_ID,
+      task_id_t::REPARTITION_BWD_TASK_ID,
+      task_id_t::REPLICATE_BWD_TASK_ID,
+      task_id_t::RESHAPE_BWD_TASK_ID,
+      task_id_t::REVERSE_BWD_TASK_ID,
+      task_id_t::SOFTMAX_BWD_TASK_ID,
+      task_id_t::SPLIT_BWD_TASK_ID,
+      task_id_t::TOPK_BWD_TASK_ID,
+      task_id_t::TRANSPOSE_BWD_TASK_ID,
+
+      // Update tasks
+      task_id_t::SGD_UPD_NCCL_TASK_ID,
+      task_id_t::ADAM_UPD_NCCL_TASK_ID,
   };
 
-  for (task_id_t init_task_id : init_task_ids) {
+  for (task_id_t task_id : task_ids) {
+    pending_registrations.push_back(register_task(
+        Realm::Processor::LOC_PROC, task_id, operation_task_wrapper));
     pending_registrations.push_back(register_task(
-        Realm::Processor::LOC_PROC, init_task_id, op_task_wrapper));
+        Realm::Processor::TOC_PROC, task_id, operation_task_wrapper));
   }
 
   return Realm::Event::merge_events(pending_registrations);
diff --git a/lib/task-spec/CMakeLists.txt b/lib/task-spec/CMakeLists.txt
index 3c7c91af67..f4f5353f70 100644
--- a/lib/task-spec/CMakeLists.txt
+++ b/lib/task-spec/CMakeLists.txt
@@ -14,6 +14,7 @@ ff_add_library(
     pcg
     spdlog
     compiler
+    Realm::Realm
 )
 
 add_subdirectory(test)
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml
index 89b94b1017..763ebf180f 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml
@@ -14,6 +14,8 @@ includes = [
   "op-attrs/parallel_tensor_space_coordinate.dtg.h",
   "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h",
   "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h",
+  "task-spec/realm/fmt/instance.h",
+  "task-spec/realm/realm.h",
 ]
 
 src_includes = [
@@ -36,6 +38,10 @@ type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>"
 name = "accessor"
 type = "std::optional<::FlexFlow::DynamicTensorAccessor>"
 
+[[fields]]
+name = "instance"
+type = "std::optional<::FlexFlow::Realm::RegionInstance>"
+
 [[fields]]
 name = "role"
 type = "std::optional<::FlexFlow::DynamicTensorRole>"
diff --git a/lib/task-spec/include/task-spec/realm/fmt/instance.h b/lib/task-spec/include/task-spec/realm/fmt/instance.h
new file mode 100644
index 0000000000..23979c7efc
--- /dev/null
+++ b/lib/task-spec/include/task-spec/realm/fmt/instance.h
@@ -0,0 +1,35 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_FMT_INSTANCE_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_FMT_INSTANCE_H
+
+#include "task-spec/realm/realm.h"
+#include "utils/check_fmtable.h"
+#include <fmt/format.h>
+#include <utility>
+
+namespace fmt {
+// Formats a Realm::RegionInstance as "<RegionInstance ID>", ID being `m.id`.
+template <typename Char>
+struct formatter<::FlexFlow::Realm::RegionInstance,
+                 Char,
+                 std::enable_if_t<!detail::has_format_as<
+                     ::FlexFlow::Realm::RegionInstance>::value>>
+    : formatter<::std::string> {
+  template <typename FormatContext>
+  auto format(::FlexFlow::Realm::RegionInstance const &m,
+              FormatContext &ctx) const -> decltype(ctx.out()) {
+    std::string result = fmt::format("<RegionInstance {}>", m.id);
+
+    return formatter<std::string>::format(result, ctx);
+  }
+};
+
+} // namespace fmt
+
+namespace FlexFlow {
+// Stream insertion for Realm::RegionInstance (defined in the matching .cc).
+std::ostream &operator<<(std::ostream &s,
+                         ::FlexFlow::Realm::RegionInstance const &m);
+
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_FMT_INSTANCE_H
diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/task-spec/include/task-spec/realm/realm.h
similarity index 63%
rename from lib/realm-execution/include/realm-execution/realm.h
rename to lib/task-spec/include/task-spec/realm/realm.h
index f15113ee92..8123c9e9fa 100644
--- a/lib/realm-execution/include/realm-execution/realm.h
+++ b/lib/task-spec/include/task-spec/realm/realm.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_H
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 
 #ifdef FLEXFLOW_USE_PREALM
 #include <realm/prealm/prealm.h>
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
index 4270119612..837ade2aad 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
@@ -23,6 +23,7 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
       /*parallel_tensor_shape=*/logit_value.parallel_tensor_shape,
       /*shard_coord=*/logit_value.shard_coord,
       /*accessor=*/std::nullopt,
+      /*instance=*/std::nullopt,
       /*role=*/mk_dynamic_tensor_role_loss(),
   };
   DynamicValueAttrs logit_grad_value{
@@ -30,6 +31,7 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
       /*parallel_tensor_shape=*/logit_value.parallel_tensor_shape,
       /*shard_coord=*/logit_value.shard_coord,
       /*accessor=*/std::nullopt,
+      /*instance=*/std::nullopt,
       /*role=*/mk_dynamic_tensor_role_bwd(),
   };
   DynamicNodeInvocation loss_invocation{
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc
index 204597386e..294241b732 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc
@@ -45,6 +45,7 @@ DynamicOpenDataflowGraph
                       /*parallel_tensor_shape=*/lift_to_parallel(attrs.shape),
                       /*shard_coord=*/std::nullopt,
                       /*accessor=*/std::nullopt,
+                      /*instance=*/std::nullopt,
                       /*role=*/std::nullopt,
                   },
               };
@@ -64,6 +65,7 @@ DynamicOpenDataflowGraph
                       /*parallel_tensor_shape=*/lift_to_parallel(attrs.shape),
                       /*shard_coord=*/std::nullopt,
                       /*accessor=*/std::nullopt,
+                      /*instance=*/std::nullopt,
                       /*role=*/std::nullopt,
                   },
               };
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
index e90ef10398..eceb580a20 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
@@ -44,6 +44,7 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
                             /*parallel_tensor_shape=*/attrs.shape,
                             /*shard_coord=*/std::nullopt,
                             /*accessor=*/std::nullopt,
+                            /*instance=*/std::nullopt,
                             /*role=*/std::nullopt,
                         },
                     };
@@ -64,6 +65,7 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
                             /*parallel_tensor_shape=*/attrs.shape,
                             /*shard_coord=*/std::nullopt,
                             /*accessor=*/std::nullopt,
+                            /*instance=*/std::nullopt,
                             /*role=*/std::nullopt,
                         },
                     };
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc
index 58a32db6c1..23708f3779 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc
@@ -51,6 +51,7 @@ static DynamicNodeInvocation get_update_invocation_for_invocation(
   DynamicValueAttrs value_attrs = output.second;
 
   ASSERT(value_attrs.accessor == std::nullopt);
+  ASSERT(value_attrs.instance == std::nullopt);
 
   DynamicNodeAttrs update_node_attrs = i.node_attrs;
   update_node_attrs.task_type = DynamicTaskType::UPD;
diff --git a/lib/task-spec/src/task-spec/realm/fmt/instance.h b/lib/task-spec/src/task-spec/realm/fmt/instance.h
new file mode 100644
index 0000000000..fa15e1c16f
--- /dev/null
+++ b/lib/task-spec/src/task-spec/realm/fmt/instance.h
@@ -0,0 +1,10 @@
+#include "task-spec/realm/fmt/instance.h"
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s,
+                         ::FlexFlow::Realm::RegionInstance const &m) {
+  return s << fmt::to_string(m); // same text as the fmt formatter
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc
index fc9110b6e4..bb9a45e59a 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc
@@ -16,6 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*parallel_tensor_shape=*/std::nullopt,
         /*shard_coord=*/std::nullopt,
         /*accessor=*/std::nullopt,
+        /*instance=*/std::nullopt,
         /*tensor_type=*/std::nullopt,
     };
 
@@ -29,6 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*parallel_tensor_shape=*/std::nullopt,
         /*shard_coord=*/std::nullopt,
         /*accessor=*/std::nullopt,
+        /*instance=*/std::nullopt,
         /*tensor_type=*/std::nullopt,
     };
 
@@ -42,6 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*parallel_tensor_shape=*/std::nullopt,
         /*shard_coord=*/std::nullopt,
         /*accessor=*/std::nullopt,
+        /*instance=*/std::nullopt,
         /*tensor_type=*/std::nullopt,
     };
 
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc
index 40d37f50df..c28e12e0af 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc
@@ -76,6 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/shard_coord,
           /*accessor=*/std::nullopt,
+          /*instance=*/std::nullopt,
           /*role=*/std::nullopt,
       };
     };
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc
index e8fcf2e40b..e57691b475 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc
@@ -20,6 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/std::nullopt,
           /*accessor=*/std::nullopt,
+          /*instance=*/std::nullopt,
           /*role=*/tensor_role,
       };
     };
@@ -113,6 +114,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/std::nullopt,
           /*accessor=*/std::nullopt,
+          /*instance=*/std::nullopt,
           /*role=*/tensor_role,
       };
     };
@@ -229,6 +231,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/std::nullopt,
           /*accessor=*/std::nullopt,
+          /*instance=*/std::nullopt,
           /*role=*/tensor_type,
       };
     };
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc
index 23fbb6e514..4d88dde805 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc
@@ -121,6 +121,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/shard_coord,
           /*accessor=*/std::nullopt,
+          /*instance=*/std::nullopt,
           /*role=*/std::nullopt,
       };
     };

From 9066642ce253c050e8a73348a41be2e84d92daa7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 10:48:32 -0800
Subject: [PATCH 021/113] Fix filename.

---
 lib/task-spec/src/task-spec/realm/fmt/{instance.h => instance.cc} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename lib/task-spec/src/task-spec/realm/fmt/{instance.h => instance.cc} (100%)

diff --git a/lib/task-spec/src/task-spec/realm/fmt/instance.h b/lib/task-spec/src/task-spec/realm/fmt/instance.cc
similarity index 100%
rename from lib/task-spec/src/task-spec/realm/fmt/instance.h
rename to lib/task-spec/src/task-spec/realm/fmt/instance.cc

From 012ff29dee7cd89c6996f9155ef2f71a8aa8de18 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 11:17:12 -0800
Subject: [PATCH 022/113] Some work in instance allocation and
 registry/manager.

---
 .../realm-execution/instance_allocation.h     |  26 +++++
 .../include/realm-execution/realm_context.h   |   2 +
 .../realm-execution/realm_task_registry.h     |   8 ++
 .../realm-execution/instance_allocation.cc    | 104 ++++++++++++++++++
 .../parallel_computation_graph_instance.cc    |  13 +--
 .../src/realm-execution/realm_context.cc      |   6 +
 .../src/realm-execution/realm_manager.cc      |  46 ++++----
 .../realm-execution/realm_task_registry.cc    |  14 +--
 8 files changed, 182 insertions(+), 37 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/instance_allocation.h
 create mode 100644 lib/realm-execution/src/realm-execution/instance_allocation.cc

diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
new file mode 100644
index 0000000000..ea07cf0601
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -0,0 +1,26 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H
+
+#include "realm-execution/realm_context.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+
+namespace FlexFlow {
+
+bool no_instances_are_allocated(DynamicOpenDataflowGraph const &);
+bool all_instances_are_allocated(DynamicOpenDataflowGraph const &);
+
+bool instances_are_ready_for_allocation(DynamicOpenDataflowGraph const &g);
+
+DynamicValueAttrs
+    perform_instance_allocation_for_value(DynamicValueAttrs const &,
+                                          Allocator &);
+
+DynamicOpenDataflowGraph perform_instance_allocation(
+    DynamicOpenDataflowGraph const &,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &preallocated,
+    RealmContext &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 357b05b699..c72fe30b72 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -21,6 +21,8 @@ struct RealmContext {
   device_handle_t const &get_current_device_handle() const;
   device_id_t const &get_current_device_idx() const;
 
+  Realm::Event get_outstanding_events();
+
 protected:
   [[nodiscard]] Realm::Event merge_outstanding_events();
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_registry.h b/lib/realm-execution/include/realm-execution/realm_task_registry.h
index d9d993795b..d6bf5b927f 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/realm_task_registry.h
@@ -6,6 +6,14 @@
 
 namespace FlexFlow {
 
+[[nodiscard]] Realm::Event register_task(Realm::Processor::Kind target_kind,
+                                         task_id_t func_id,
+                                         void (*task_body)(void const *,
+                                                           size_t,
+                                                           void const *,
+                                                           size_t,
+                                                           Realm::Processor));
+
 [[nodiscard]] Realm::Event register_all_tasks();
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
new file mode 100644
index 0000000000..76d89313a6
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -0,0 +1,104 @@
+#include "realm-execution/instance_allocation.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "utils/bidict/generate_bidict.h"
+#include "utils/containers/all_are_true.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/unordered_set_of.h"
+#include "utils/exception.h"
+#include "utils/optional.h"
+
+namespace FlexFlow {
+
+bool no_instances_are_allocated(DynamicOpenDataflowGraph const &g) {
+  return all_are_true(
+      transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool {
+        return !v.accessor.has_value() && !v.instance.has_value();
+      }));
+}
+
+bool all_instances_are_allocated(DynamicOpenDataflowGraph const &g) {
+  return all_are_true(
+      transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool {
+        return v.instance.has_value();
+      }));
+}
+
+bool instances_are_ready_for_allocation(DynamicOpenDataflowGraph const &g) {
+  return all_are_true(
+      transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool {
+        return v.parallel_tensor_shape.has_value();
+      }));
+}
+
+DynamicValueAttrs
+    perform_instance_allocation_for_value(DynamicValueAttrs const &value,
+                                          RealmContext &ctx) {
+  ASSERT(value.accessor == std::nullopt);
+  ASSERT(value.instance == std::nullopt);
+
+  TensorShape shape =
+      get_piece_shape(assert_unwrap(value.parallel_tensor_shape));
+
+  NOT_IMPLEMENTED();
+  // GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+
+  DynamicValueAttrs result = value;
+  // result.accessor = DynamicTensorAccessor{accessor};
+
+  return result;
+}
+
+DynamicOpenDataflowGraph perform_instance_allocation(
+    DynamicOpenDataflowGraph const &g,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &preallocated,
+    RealmContext &ctx) {
+  ASSERT(no_instances_are_allocated(g));
+  ASSERT(instances_are_ready_for_allocation(g));
+  for (DynamicValueAttrs const &v : keys(preallocated)) {
+    ASSERT(v.accessor == std::nullopt);
+    ASSERT(v.instance == std::nullopt);
+  }
+
+  std::unordered_set<DynamicValueAttrs> all_values =
+      unordered_set_of(get_dynamic_values(g));
+
+  bidict<DynamicValueAttrs, DynamicValueAttrs> unallocated_to_allocated =
+      generate_bidict(all_values,
+                      [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
+                        if (contains_key(preallocated, v)) {
+                          // FIXME: Attach external instance to existing
+                          // allocation and use that
+                          NOT_IMPLEMENTED();
+                        } else {
+                          return perform_instance_allocation_for_value(v, ctx);
+                        }
+                      });
+
+  DynamicOpenDataflowGraph result = transform_dynamic_invocation_set(
+      g, [&](DynamicNodeInvocation const &i) -> DynamicNodeInvocation {
+        return DynamicNodeInvocation{
+            /*inputs=*/map_values(
+                i.inputs,
+                [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
+                  return unallocated_to_allocated.at_l(v);
+                }),
+            /*node_attrs=*/i.node_attrs,
+            /*outputs=*/
+            map_values(i.outputs,
+                       [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
+                         return unallocated_to_allocated.at_l(v);
+                       }),
+        };
+      });
+
+  ASSERT(all_instances_are_allocated(result));
+
+  return result;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 80ed98f8c2..ec80519cf3 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -1,7 +1,7 @@
 #include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
 #include "local-execution/device_state_initialization.h"
-#include "local-execution/tensor_allocation.h"
 #include "pcg/optimizer_attrs.h"
+#include "realm-execution/instance_allocation.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
@@ -63,7 +63,7 @@ static GenericTensorAccessorW
 }
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
-    RealmContext &realm,
+    RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
@@ -91,8 +91,7 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
 
   dg = perform_update_insertion(dg, optimizer_attrs);
   dg = perform_shard_expansion(dg);
-  dg = perform_tensor_allocation(
-      dg, inputs, realm.get_current_device_allocator());
+  dg = perform_instance_allocation(dg, inputs, ctx);
 
   std::optional<GenericTensorAccessorW> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
@@ -100,12 +99,12 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
       });
 
   dg = perform_device_state_initialization(dg,
-                                           realm.get_current_device_allocator(),
+                                           ctx.get_current_device_allocator(),
                                            profiling_settings,
-                                           realm.get_current_device_handle(),
+                                           ctx.get_current_device_handle(),
                                            iteration_config,
                                            optimizer_attrs,
-                                           realm.get_current_device_idx());
+                                           ctx.get_current_device_idx());
   NOT_IMPLEMENTED();
 }
 
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 5068373ebe..ede6ae6d8d 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -25,6 +25,12 @@ device_id_t const &RealmContext::get_current_device_idx() const {
   NOT_IMPLEMENTED();
 }
 
+Realm::Event RealmContext::get_outstanding_events() {
+  Realm::Event result = this->merge_outstanding_events();
+  this->outstanding_events.push_back(result);
+  return result;
+}
+
 Realm::Event RealmContext::merge_outstanding_events() {
   Realm::Event result = Realm::Event::merge_events(this->outstanding_events);
   this->outstanding_events.clear();
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 0c34d77204..63c6266948 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -1,21 +1,11 @@
 #include "realm-execution/realm_manager.h"
 #include "realm-execution/realm_task_id_t.h"
+#include "realm-execution/realm_task_registry.h"
 #include "realm-execution/task_id_t.dtg.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
 
-RealmManager::RealmManager(int *argc, char ***argv) {
-  bool ok = this->runtime.init(argc, argv);
-  ASSERT(ok);
-}
-
-RealmManager::~RealmManager() {
-  Realm::Event outstanding = this->merge_outstanding_events();
-  this->runtime.shutdown(outstanding);
-  this->runtime.wait_for_shutdown();
-}
-
 static void controller_task_wrapper(void const *args,
                                     size_t arglen,
                                     void const *userdata,
@@ -29,26 +19,36 @@ static void controller_task_wrapper(void const *args,
   thunk(ctx);
 }
 
+RealmManager::RealmManager(int *argc, char ***argv) {
+  bool ok = this->runtime.init(argc, argv);
+  ASSERT(ok);
+
+  // Register all tasks at initialization time so we don't need to later
+  register_all_tasks().wait();
+  register_task(Realm::Processor::LOC_PROC,
+                task_id_t::CONTROLLER_TASK_ID,
+                controller_task_wrapper)
+      .wait();
+}
+
+RealmManager::~RealmManager() {
+  Realm::Event outstanding = this->merge_outstanding_events();
+  this->runtime.shutdown(outstanding);
+  this->runtime.wait_for_shutdown();
+}
+
 Realm::Event
     RealmManager::start_controller(std::function<void(RealmContext &)> thunk) {
-  Realm::Processor::TaskFuncID CONTROLLER_TASK_ID =
-      get_realm_task_id_for_task_id(task_id_t::CONTROLLER_TASK_ID);
-  Realm::Event task_ready = Realm::Processor::register_task_by_kind(
-      Realm::Processor::LOC_PROC,
-      /*global=*/false,
-      CONTROLLER_TASK_ID,
-      Realm::CodeDescriptor(controller_task_wrapper),
-      Realm::ProfilingRequestSet(),
-      &thunk,
-      sizeof(thunk));
-
   Realm::Processor target_proc =
       Realm::Machine::ProcessorQuery(Realm::Machine::get_machine())
           .only_kind(Realm::Processor::LOC_PROC)
           .first();
 
   Realm::Event task_complete = this->runtime.collective_spawn(
-      target_proc, CONTROLLER_TASK_ID, &thunk, sizeof(thunk), task_ready);
+      target_proc,
+      get_realm_task_id_for_task_id(task_id_t::CONTROLLER_TASK_ID),
+      &thunk,
+      sizeof(thunk));
   this->outstanding_events.push_back(task_complete);
   return task_complete;
 }
diff --git a/lib/realm-execution/src/realm-execution/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/realm_task_registry.cc
index 5c61c208fb..436a6af3f3 100644
--- a/lib/realm-execution/src/realm-execution/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/realm_task_registry.cc
@@ -9,13 +9,13 @@ static void operation_task_wrapper(
   NOT_IMPLEMENTED();
 }
 
-static Realm::Event register_task(Realm::Processor::Kind target_kind,
-                                  task_id_t func_id,
-                                  void (*task_body)(void const *,
-                                                    size_t,
-                                                    void const *,
-                                                    size_t,
-                                                    Realm::Processor)) {
+Realm::Event register_task(Realm::Processor::Kind target_kind,
+                           task_id_t func_id,
+                           void (*task_body)(void const *,
+                                             size_t,
+                                             void const *,
+                                             size_t,
+                                             Realm::Processor)) {
   return Realm::Processor::register_task_by_kind(
       target_kind,
       /*global=*/false,

From b7ca53b813b9ff955f0d652deabe286ad82ddf47 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 12:21:57 -0800
Subject: [PATCH 023/113] Instance allocation.

---
 .../realm-execution/instance_allocation.h     |  2 +-
 .../include/realm-execution/realm_context.h   | 10 ++
 .../realm-execution/instance_allocation.cc    | 16 ++--
 .../parallel_computation_graph_instance.cc    | 18 ++--
 .../src/realm-execution/realm_context.cc      | 93 +++++++++++++++++++
 5 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index ea07cf0601..d1dfa3fda0 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -15,7 +15,7 @@ DynamicValueAttrs
     perform_instance_allocation_for_value(DynamicValueAttrs const &,
                                           Allocator &);
 
-DynamicOpenDataflowGraph perform_instance_allocation(
+std::pair<DynamicOpenDataflowGraph, Realm::Event> perform_instance_allocation(
     DynamicOpenDataflowGraph const &,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &preallocated,
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index c72fe30b72..90ef402fb6 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -21,9 +21,19 @@ struct RealmContext {
   device_handle_t const &get_current_device_handle() const;
   device_id_t const &get_current_device_idx() const;
 
+  // Instance management
+  std::pair<Realm::RegionInstance, Realm::Event>
+      create_instance(Realm::Memory memory,
+                      TensorShape const &shape,
+                      Realm::ProfilingRequestSet const &prs,
+                      Realm::Event wait_on = Realm::Event::NO_EVENT);
+
+  // Get the current set of outstanding events
   Realm::Event get_outstanding_events();
 
 protected:
+  // Compact AND CLEAR the outstanding event queue
+  // Important: USER MUST BLOCK on event or else use it, or it WILL BE LOST
   [[nodiscard]] Realm::Event merge_outstanding_events();
 
 protected:
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index 76d89313a6..0870117bfe 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -1,11 +1,13 @@
 #include "realm-execution/instance_allocation.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
 #include "utils/bidict/generate_bidict.h"
 #include "utils/containers/all_are_true.h"
 #include "utils/containers/contains_key.h"
+#include "utils/containers/make.h"
 #include "utils/containers/map_values.h"
 #include "utils/containers/unordered_set_of.h"
 #include "utils/exception.h"
@@ -40,19 +42,19 @@ DynamicValueAttrs
   ASSERT(value.accessor == std::nullopt);
   ASSERT(value.instance == std::nullopt);
 
-  TensorShape shape =
-      get_piece_shape(assert_unwrap(value.parallel_tensor_shape));
+  TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value());
 
-  NOT_IMPLEMENTED();
-  // GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  Realm::Memory memory = Realm::Memory::NO_MEMORY; // FIXME
+  auto [instance, ready] =
+      ctx.create_instance(memory, shape, Realm::ProfilingRequestSet());
 
   DynamicValueAttrs result = value;
-  // result.accessor = DynamicTensorAccessor{accessor};
+  result.instance = instance;
 
   return result;
 }
 
-DynamicOpenDataflowGraph perform_instance_allocation(
+std::pair<DynamicOpenDataflowGraph, Realm::Event> perform_instance_allocation(
     DynamicOpenDataflowGraph const &g,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &preallocated,
@@ -98,7 +100,7 @@ DynamicOpenDataflowGraph perform_instance_allocation(
 
   ASSERT(all_instances_are_allocated(result));
 
-  return result;
+  return std::pair{result, ctx.get_outstanding_events()};
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index ec80519cf3..dddb624df3 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -53,13 +53,12 @@ std::optional<GenericTensorAccessorR>
   return this->logit_grad_tensor;
 }
 
-static GenericTensorAccessorW
-    get_loss_tensor_accessor(DynamicOpenDataflowGraph const &dg,
+static Realm::RegionInstance
+    get_loss_tensor_instance(DynamicOpenDataflowGraph const &dg,
                              DynamicValueAttrs const &value) {
   return find_output_tensor(dg, value.tensor_guid, value.role)
       .value()
-      .second.accessor.value()
-      .get<GenericTensorAccessorW>();
+      .second.instance.value();
 }
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
@@ -91,11 +90,16 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
 
   dg = perform_update_insertion(dg, optimizer_attrs);
   dg = perform_shard_expansion(dg);
-  dg = perform_instance_allocation(dg, inputs, ctx);
+  Realm::Event instances_ready;
+  {
+    auto [dg2, ready] = perform_instance_allocation(dg, inputs, ctx);
+    dg = dg2;
+    instances_ready = ready;
+  }
 
-  std::optional<GenericTensorAccessorW> logit_grad_tensor =
+  std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
-        return get_loss_tensor_accessor(dg, lgv);
+        return get_loss_tensor_instance(dg, lgv);
       });
 
   dg = perform_device_state_initialization(dg,
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index ede6ae6d8d..6ab7f992fa 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -1,7 +1,9 @@
 #include "realm-execution/realm_context.h"
+#include "op-attrs/datatype.h"
 #include "realm-execution/realm_task_id_t.h"
 #include "realm-execution/task_id_t.dtg.h"
 #include "utils/exception.h"
+#include "utils/positive_int/positive_int.h"
 
 namespace FlexFlow {
 
@@ -25,6 +27,97 @@ device_id_t const &RealmContext::get_current_device_idx() const {
   NOT_IMPLEMENTED();
 }
 
+std::pair<Realm::RegionInstance, Realm::Event>
+    RealmContext::create_instance(Realm::Memory memory,
+                                  TensorShape const &shape,
+                                  Realm::ProfilingRequestSet const &prs,
+                                  Realm::Event wait_on) {
+  std::vector<int> dims{shape.dims.ff_ordered.begin(),
+                        shape.dims.ff_ordered.end()};
+  std::vector<size_t> field_sizes{
+      static_cast<size_t>(int{size_of_datatype(shape.data_type)})};
+  Realm::RegionInstance inst;
+  Realm::Event ready;
+  switch (shape.dims.ff_ordered.num_dims()) {
+#if REALM_MAX_DIM >= 1
+    case 1:
+      ready = Realm::RegionInstance::create_instance(
+          inst,
+          memory,
+          Realm::Rect<1>(Realm::Point<1>::ZEROES(),
+                         Realm::Point<1>(dims.data()) -
+                             Realm::Point<1>::ONES()),
+          field_sizes,
+          /*block_size=*/0 /*SOA*/,
+          prs,
+          wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 2
+    case 2:
+      ready = Realm::RegionInstance::create_instance(
+          inst,
+          memory,
+          Realm::Rect<2>(Realm::Point<2>::ZEROES(),
+                         Realm::Point<2>(dims.data()) -
+                             Realm::Point<2>::ONES()),
+          field_sizes,
+          /*block_size=*/0 /*SOA*/,
+          prs,
+          wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 3
+    case 3:
+      ready = Realm::RegionInstance::create_instance(
+          inst,
+          memory,
+          Realm::Rect<3>(Realm::Point<3>::ZEROES(),
+                         Realm::Point<3>(dims.data()) -
+                             Realm::Point<3>::ONES()),
+          field_sizes,
+          /*block_size=*/0 /*SOA*/,
+          prs,
+          wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 4
+    case 4:
+      ready = Realm::RegionInstance::create_instance(
+          inst,
+          memory,
+          Realm::Rect<4>(Realm::Point<4>::ZEROES(),
+                         Realm::Point<4>(dims.data()) -
+                             Realm::Point<4>::ONES()),
+          field_sizes,
+          /*block_size=*/0 /*SOA*/,
+          prs,
+          wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 5
+    case 5:
+      ready = Realm::RegionInstance::create_instance(
+          inst,
+          memory,
+          Realm::Rect<5>(Realm::Point<5>::ZEROES(),
+                         Realm::Point<5>(dims.data()) -
+                             Realm::Point<5>::ONES()),
+          field_sizes,
+          /*block_size=*/0 /*SOA*/,
+          prs,
+          wait_on);
+      break;
+#endif
+    default:
+      PANIC("TensorShape dims greater than REALM_MAX_DIM",
+            fmt::to_string(shape.dims.ff_ordered.num_dims()));
+      break;
+  }
+  this->outstanding_events.push_back(ready);
+  return std::pair{inst, ready};
+}
+
 Realm::Event RealmContext::get_outstanding_events() {
   Realm::Event result = this->merge_outstanding_events();
   this->outstanding_events.push_back(result);

From 84c205a24f52a980018dd9e3260072a96c1507cf Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 12:26:48 -0800
Subject: [PATCH 024/113] Simplify dims and use brace initialization.

---
 .../src/realm-execution/realm_context.cc      | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 6ab7f992fa..4890eb4a5d 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -38,15 +38,15 @@ std::pair<Realm::RegionInstance, Realm::Event>
       static_cast<size_t>(int{size_of_datatype(shape.data_type)})};
   Realm::RegionInstance inst;
   Realm::Event ready;
-  switch (shape.dims.ff_ordered.num_dims()) {
+  switch (dims.size()) {
 #if REALM_MAX_DIM >= 1
     case 1:
       ready = Realm::RegionInstance::create_instance(
           inst,
           memory,
-          Realm::Rect<1>(Realm::Point<1>::ZEROES(),
-                         Realm::Point<1>(dims.data()) -
-                             Realm::Point<1>::ONES()),
+          Realm::Rect<1>{Realm::Point<1>::ZEROES(),
+                         Realm::Point<1>{dims.data()} -
+                             Realm::Point<1>::ONES()},
           field_sizes,
           /*block_size=*/0 /*SOA*/,
           prs,
@@ -58,9 +58,9 @@ std::pair<Realm::RegionInstance, Realm::Event>
       ready = Realm::RegionInstance::create_instance(
           inst,
           memory,
-          Realm::Rect<2>(Realm::Point<2>::ZEROES(),
-                         Realm::Point<2>(dims.data()) -
-                             Realm::Point<2>::ONES()),
+          Realm::Rect<2>{Realm::Point<2>::ZEROES(),
+                         Realm::Point<2>{dims.data()} -
+                             Realm::Point<2>::ONES()},
           field_sizes,
           /*block_size=*/0 /*SOA*/,
           prs,
@@ -72,9 +72,9 @@ std::pair<Realm::RegionInstance, Realm::Event>
       ready = Realm::RegionInstance::create_instance(
           inst,
           memory,
-          Realm::Rect<3>(Realm::Point<3>::ZEROES(),
-                         Realm::Point<3>(dims.data()) -
-                             Realm::Point<3>::ONES()),
+          Realm::Rect<3>{Realm::Point<3>::ZEROES(),
+                         Realm::Point<3>{dims.data()} -
+                             Realm::Point<3>::ONES()},
           field_sizes,
           /*block_size=*/0 /*SOA*/,
           prs,
@@ -86,9 +86,9 @@ std::pair<Realm::RegionInstance, Realm::Event>
       ready = Realm::RegionInstance::create_instance(
           inst,
           memory,
-          Realm::Rect<4>(Realm::Point<4>::ZEROES(),
-                         Realm::Point<4>(dims.data()) -
-                             Realm::Point<4>::ONES()),
+          Realm::Rect<4>{Realm::Point<4>::ZEROES(),
+                         Realm::Point<4>{dims.data()} -
+                             Realm::Point<4>::ONES()},
           field_sizes,
           /*block_size=*/0 /*SOA*/,
           prs,
@@ -100,9 +100,9 @@ std::pair<Realm::RegionInstance, Realm::Event>
       ready = Realm::RegionInstance::create_instance(
           inst,
           memory,
-          Realm::Rect<5>(Realm::Point<5>::ZEROES(),
-                         Realm::Point<5>(dims.data()) -
-                             Realm::Point<5>::ONES()),
+          Realm::Rect<5>{Realm::Point<5>::ZEROES(),
+                         Realm::Point<5>{dims.data()} -
+                             Realm::Point<5>::ONES()},
           field_sizes,
           /*block_size=*/0 /*SOA*/,
           prs,
@@ -111,7 +111,7 @@ std::pair<Realm::RegionInstance, Realm::Event>
 #endif
     default:
       PANIC("TensorShape dims greater than REALM_MAX_DIM",
-            fmt::to_string(shape.dims.ff_ordered.num_dims()));
+            fmt::to_string(dims.size()));
       break;
   }
   this->outstanding_events.push_back(ready);

From 24ca59bc5fa90770df0130f271d459b6a1623e96 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 13:24:41 -0800
Subject: [PATCH 025/113] Refactor: extract rect_from_dims helper.

---
 .../src/realm-execution/realm_context.cc      | 105 +++++++++---------
 1 file changed, 51 insertions(+), 54 deletions(-)

diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 4890eb4a5d..b2671f709e 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -1,5 +1,6 @@
 #include "realm-execution/realm_context.h"
 #include "op-attrs/datatype.h"
+#include "op-attrs/tensor_dims.dtg.h"
 #include "realm-execution/realm_task_id_t.h"
 #include "realm-execution/task_id_t.dtg.h"
 #include "utils/exception.h"
@@ -27,91 +28,87 @@ device_id_t const &RealmContext::get_current_device_idx() const {
   NOT_IMPLEMENTED();
 }
 
+template <int N>
+static Realm::Rect<N> rect_from_dims(TensorDims const &dims) {
+  std::vector<int> values{dims.ff_ordered.begin(), dims.ff_ordered.end()};
+  return Realm::Rect<N>{Realm::Point<N>::ZEROES(),
+                        Realm::Point<N>{values.data()} -
+                            Realm::Point<N>::ONES()};
+}
+
 std::pair<Realm::RegionInstance, Realm::Event>
     RealmContext::create_instance(Realm::Memory memory,
                                   TensorShape const &shape,
                                   Realm::ProfilingRequestSet const &prs,
                                   Realm::Event wait_on) {
-  std::vector<int> dims{shape.dims.ff_ordered.begin(),
-                        shape.dims.ff_ordered.end()};
   std::vector<size_t> field_sizes{
       static_cast<size_t>(int{size_of_datatype(shape.data_type)})};
   Realm::RegionInstance inst;
   Realm::Event ready;
-  switch (dims.size()) {
+  switch (shape.dims.ff_ordered.num_dims()) {
 #if REALM_MAX_DIM >= 1
     case 1:
-      ready = Realm::RegionInstance::create_instance(
-          inst,
-          memory,
-          Realm::Rect<1>{Realm::Point<1>::ZEROES(),
-                         Realm::Point<1>{dims.data()} -
-                             Realm::Point<1>::ONES()},
-          field_sizes,
-          /*block_size=*/0 /*SOA*/,
-          prs,
-          wait_on);
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<1>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
       break;
 #endif
 #if REALM_MAX_DIM >= 2
     case 2:
-      ready = Realm::RegionInstance::create_instance(
-          inst,
-          memory,
-          Realm::Rect<2>{Realm::Point<2>::ZEROES(),
-                         Realm::Point<2>{dims.data()} -
-                             Realm::Point<2>::ONES()},
-          field_sizes,
-          /*block_size=*/0 /*SOA*/,
-          prs,
-          wait_on);
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<2>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
       break;
 #endif
 #if REALM_MAX_DIM >= 3
     case 3:
-      ready = Realm::RegionInstance::create_instance(
-          inst,
-          memory,
-          Realm::Rect<3>{Realm::Point<3>::ZEROES(),
-                         Realm::Point<3>{dims.data()} -
-                             Realm::Point<3>::ONES()},
-          field_sizes,
-          /*block_size=*/0 /*SOA*/,
-          prs,
-          wait_on);
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<3>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
       break;
 #endif
 #if REALM_MAX_DIM >= 4
     case 4:
-      ready = Realm::RegionInstance::create_instance(
-          inst,
-          memory,
-          Realm::Rect<4>{Realm::Point<4>::ZEROES(),
-                         Realm::Point<4>{dims.data()} -
-                             Realm::Point<4>::ONES()},
-          field_sizes,
-          /*block_size=*/0 /*SOA*/,
-          prs,
-          wait_on);
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<4>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
       break;
 #endif
 #if REALM_MAX_DIM >= 5
     case 5:
-      ready = Realm::RegionInstance::create_instance(
-          inst,
-          memory,
-          Realm::Rect<5>{Realm::Point<5>::ZEROES(),
-                         Realm::Point<5>{dims.data()} -
-                             Realm::Point<5>::ONES()},
-          field_sizes,
-          /*block_size=*/0 /*SOA*/,
-          prs,
-          wait_on);
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<5>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
       break;
 #endif
     default:
       PANIC("TensorShape dims greater than REALM_MAX_DIM",
-            fmt::to_string(dims.size()));
+            fmt::to_string(shape.dims.ff_ordered.num_dims()));
       break;
   }
   this->outstanding_events.push_back(ready);

From 968ce9cd1273be1a156252c0d63de9d17c06ad32 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 14:39:40 -0800
Subject: [PATCH 026/113] Sketch out device mapping.

---
 .../include/realm-execution/realm_context.h   |  5 +++
 .../realm-execution/instance_allocation.cc    | 41 +++++++++++--------
 .../src/realm-execution/realm_context.cc      |  9 ++++
 3 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 90ef402fb6..6ba64338c9 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -16,6 +16,11 @@ struct RealmContext {
   RealmContext(RealmContext const &) = delete;
   RealmContext(RealmContext &&) = delete;
 
+  // Device mapping
+  Realm::Processor
+      map_device_coord_to_processor(MachineSpaceCoordinate const &);
+  Realm::Memory get_nearest_memory(Realm::Processor) const;
+
   // Current device context
   Allocator &get_current_device_allocator() const;
   device_handle_t const &get_current_device_handle() const;
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index 0870117bfe..33b7b54937 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -2,8 +2,10 @@
 #include "op-attrs/parallel_tensor_shape.h"
 #include "op-attrs/tensor_shape.dtg.h"
 #include "realm-execution/realm_context.h"
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "utils/bidict/generate_bidict.h"
 #include "utils/containers/all_are_true.h"
 #include "utils/containers/contains_key.h"
@@ -37,14 +39,17 @@ bool instances_are_ready_for_allocation(DynamicOpenDataflowGraph const &g) {
 }
 
 DynamicValueAttrs
-    perform_instance_allocation_for_value(DynamicValueAttrs const &value,
+    perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
+                                          DynamicValueAttrs const &value,
                                           RealmContext &ctx) {
   ASSERT(value.accessor == std::nullopt);
   ASSERT(value.instance == std::nullopt);
 
   TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value());
 
-  Realm::Memory memory = Realm::Memory::NO_MEMORY; // FIXME
+  MachineSpaceCoordinate device_coord = assert_unwrap(node.device_coord);
+  Realm::Processor proc = ctx.map_device_coord_to_processor(device_coord);
+  Realm::Memory memory = ctx.get_nearest_memory(proc);
   auto [instance, ready] =
       ctx.create_instance(memory, shape, Realm::ProfilingRequestSet());
 
@@ -66,20 +71,20 @@ std::pair<DynamicOpenDataflowGraph, Realm::Event> perform_instance_allocation(
     ASSERT(v.instance == std::nullopt);
   }
 
-  std::unordered_set<DynamicValueAttrs> all_values =
-      unordered_set_of(get_dynamic_values(g));
-
-  bidict<DynamicValueAttrs, DynamicValueAttrs> unallocated_to_allocated =
-      generate_bidict(all_values,
-                      [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
-                        if (contains_key(preallocated, v)) {
-                          // FIXME: Attach external instance to existing
-                          // allocation and use that
-                          NOT_IMPLEMENTED();
-                        } else {
-                          return perform_instance_allocation_for_value(v, ctx);
-                        }
-                      });
+  bidict<DynamicValueAttrs, DynamicValueAttrs> unallocated_to_allocated;
+  auto allocate = [&](DynamicNodeAttrs const &n, DynamicValueAttrs const &v) {
+    if (contains_key(preallocated, v)) {
+      // FIXME: Attach external instance to existing allocation and use that
+      NOT_IMPLEMENTED();
+    } else {
+      if (contains_key(unallocated_to_allocated, v)) {
+        return unallocated_to_allocated.at_l(v);
+      } else {
+        DynamicValueAttrs v2 = perform_instance_allocation_for_value(n, v, ctx);
+        uallocated_to_allocated.equate(v, v2);
+      }
+    }
+  };
 
   DynamicOpenDataflowGraph result = transform_dynamic_invocation_set(
       g, [&](DynamicNodeInvocation const &i) -> DynamicNodeInvocation {
@@ -87,13 +92,13 @@ std::pair<DynamicOpenDataflowGraph, Realm::Event> perform_instance_allocation(
             /*inputs=*/map_values(
                 i.inputs,
                 [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
-                  return unallocated_to_allocated.at_l(v);
+                  return allocate(i.node_attrs, v);
                 }),
             /*node_attrs=*/i.node_attrs,
             /*outputs=*/
             map_values(i.outputs,
                        [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
-                         return unallocated_to_allocated.at_l(v);
+                         return allocate(i.node_attrs, v);
                        }),
         };
       });
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index b2671f709e..30343652d7 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -17,6 +17,15 @@ RealmContext::~RealmContext() {
   }
 }
 
+Realm::Processor RealmContext::map_device_coord_to_processor(
+    MachineSpaceCoordinate const &device_coord) {
+  NOT_IMPLEMENTED();
+}
+
+Realm::Memory get_nearest_memory(Realm::Processor proc) const {
+  NOT_IMPLEMENTED();
+}
+
 Allocator &RealmContext::get_current_device_allocator() const {
   NOT_IMPLEMENTED();
 }

From 51de0110cc8f8b9665caea71766ffbeb9d168fb5 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 15:53:27 -0800
Subject: [PATCH 027/113] Move instance backing to a separate map, remove realm
 from task-spec.

---
 .../include/realm-execution}/fmt/instance.h   |  6 +-
 .../realm-execution/instance_allocation.h     |  8 +--
 .../include/realm-execution}/realm.h          |  0
 .../include/realm-execution/realm_context.h   |  3 +-
 .../include/realm-execution/realm_manager.h   |  2 +-
 .../include/realm-execution/realm_task_id_t.h |  2 +-
 .../realm-execution/realm_task_registry.h     |  2 +-
 .../tensor_instance_backing.dtg.toml          | 24 +++++++
 .../realm-execution/tensor_instance_backing.h | 12 ++++
 .../src/realm-execution}/fmt/instance.cc      |  2 +-
 .../realm-execution/instance_allocation.cc    | 72 ++++---------------
 .../parallel_computation_graph_instance.cc    | 17 +----
 .../src/realm-execution/realm_context.cc      |  2 +-
 .../tensor_instance_backing.cc                | 11 +++
 lib/task-spec/CMakeLists.txt                  |  1 -
 .../dynamic_value_attrs.dtg.toml              |  6 --
 .../task-spec/dynamic_graph/loss_insertion.cc |  2 -
 ...ake_dynamic_open_dataflow_graph_from_cg.cc |  2 -
 ...e_dynamic_open_dataflow_graph_from_mpcg.cc |  2 -
 .../dynamic_graph/update_insertion.cc         |  1 -
 .../dynamic_open_dataflow_graph.cc            |  3 -
 .../dynamic_graph/machine_slicing.cc          |  1 -
 .../task-spec/dynamic_graph/pass_expansion.cc |  3 -
 .../dynamic_graph/shard_expansion.cc          |  1 -
 24 files changed, 74 insertions(+), 111 deletions(-)
 rename lib/{task-spec/include/task-spec/realm => realm-execution/include/realm-execution}/fmt/instance.h (83%)
 rename lib/{task-spec/include/task-spec/realm => realm-execution/include/realm-execution}/realm.h (100%)
 create mode 100644 lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tensor_instance_backing.h
 rename lib/{task-spec/src/task-spec/realm => realm-execution/src/realm-execution}/fmt/instance.cc (82%)
 create mode 100644 lib/realm-execution/src/realm-execution/tensor_instance_backing.cc

diff --git a/lib/task-spec/include/task-spec/realm/fmt/instance.h b/lib/realm-execution/include/realm-execution/fmt/instance.h
similarity index 83%
rename from lib/task-spec/include/task-spec/realm/fmt/instance.h
rename to lib/realm-execution/include/realm-execution/fmt/instance.h
index 23979c7efc..b2efc59b7d 100644
--- a/lib/task-spec/include/task-spec/realm/fmt/instance.h
+++ b/lib/realm-execution/include/realm-execution/fmt/instance.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_PAIR_H
 #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_PAIR_H
 
-#include "task-spec/realm/realm.h"
+#include "realm-execution/realm.h"
 #include "utils/check_fmtable.h"
 #include <fmt/format.h>
 #include <utility>
@@ -15,8 +15,8 @@ struct formatter<::FlexFlow::Realm::RegionInstance,
                      ::FlexFlow::Realm::RegionInstance>::value>>
     : formatter<::std::string> {
   template <typename FormatContext>
-  auto format(::FlexFlow::Realm::RegionInstance const &m,
-              FormatContext &ctx) const -> decltype(ctx.out()) {
+  auto format(::FlexFlow::Realm::RegionInstance const &m, FormatContext &ctx)
+      -> decltype(ctx.out()) {
     std::string result = fmt::format("<RegionInstance {}>", m.id);
 
     return formatter<std::string>::format(result, ctx);
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index d1dfa3fda0..59065694e9 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -2,20 +2,16 @@
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H
 
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 
 namespace FlexFlow {
 
-bool no_instances_are_allocated(DynamicOpenDataflowGraph const &);
-bool all_instances_are_allocated(DynamicOpenDataflowGraph const &);
-
-bool instances_are_ready_for_allocation(DynamicOpenDataflowGraph const &g);
-
 DynamicValueAttrs
     perform_instance_allocation_for_value(DynamicValueAttrs const &,
                                           Allocator &);
 
-std::pair<DynamicOpenDataflowGraph, Realm::Event> perform_instance_allocation(
+TensorInstanceBacking perform_instance_allocation(
     DynamicOpenDataflowGraph const &,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &preallocated,
diff --git a/lib/task-spec/include/task-spec/realm/realm.h b/lib/realm-execution/include/realm-execution/realm.h
similarity index 100%
rename from lib/task-spec/include/task-spec/realm/realm.h
rename to lib/realm-execution/include/realm-execution/realm.h
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 6ba64338c9..bfc1a53cd3 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -4,7 +4,8 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "pcg/device_id_t.dtg.h"
-#include "task-spec/realm/realm.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "realm-execution/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index ebf3bb401e..bf5e8f72f1 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -4,8 +4,8 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "pcg/device_id_t.dtg.h"
+#include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
-#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
index 327cf9ffd0..8e6da1a2bd 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/realm_task_id_t.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 
+#include "realm-execution/realm.h"
 #include "realm-execution/task_id_t.dtg.h"
-#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_registry.h b/lib/realm-execution/include/realm-execution/realm_task_registry.h
index d6bf5b927f..f800b1d8c4 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/realm_task_registry.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
 
+#include "realm-execution/realm.h"
 #include "realm-execution/task_id_t.dtg.h"
-#include "task-spec/realm/realm.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
new file mode 100644
index 0000000000..bdf08df59c
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -0,0 +1,24 @@
+namespace = "FlexFlow"
+name = "TensorInstanceBacking"
+type = "struct"
+features = [
+  "eq",
+  #"fmt",
+  "hash",
+]
+
+includes = [
+  "<unordered_map>",
+  "realm-execution/realm.h",
+  "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h",
+]
+
+src_includes = [
+  "realm-execution/fmt/instance.h",
+  "utils/hash/unordered_map.h",
+  "utils/fmt/unordered_map.h",
+]
+
+[[fields]]
+name = "backing"
+type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, std::pair<::FlexFlow::Realm::RegionInstance, ::FlexFlow::Realm::Event>>"
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
new file mode 100644
index 0000000000..1d143b7409
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
@@ -0,0 +1,12 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H
+
+#include "realm-execution/tensor_instance_backing.dtg.h"
+
+namespace FlexFlow {
+
+TensorInstanceBacking make_empty_tensor_instance_backing();
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/src/task-spec/realm/fmt/instance.cc b/lib/realm-execution/src/realm-execution/fmt/instance.cc
similarity index 82%
rename from lib/task-spec/src/task-spec/realm/fmt/instance.cc
rename to lib/realm-execution/src/realm-execution/fmt/instance.cc
index fa15e1c16f..f8eabe9bb0 100644
--- a/lib/task-spec/src/task-spec/realm/fmt/instance.cc
+++ b/lib/realm-execution/src/realm-execution/fmt/instance.cc
@@ -1,4 +1,4 @@
-#include "task-spec/realm/fmt/instance.h"
+#include "realm-execution/fmt/instance.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index 33b7b54937..c033f0bac1 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -1,7 +1,9 @@
 #include "realm-execution/instance_allocation.h"
+#include "local-execution/tensor_allocation.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "op-attrs/tensor_shape.dtg.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
@@ -17,95 +19,47 @@
 
 namespace FlexFlow {
 
-bool no_instances_are_allocated(DynamicOpenDataflowGraph const &g) {
-  return all_are_true(
-      transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool {
-        return !v.accessor.has_value() && !v.instance.has_value();
-      }));
-}
-
-bool all_instances_are_allocated(DynamicOpenDataflowGraph const &g) {
-  return all_are_true(
-      transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool {
-        return v.instance.has_value();
-      }));
-}
-
-bool instances_are_ready_for_allocation(DynamicOpenDataflowGraph const &g) {
-  return all_are_true(
-      transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool {
-        return v.parallel_tensor_shape.has_value();
-      }));
-}
-
-DynamicValueAttrs
+std::pair<Realm::RegionInstance, Realm::Event>
     perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
                                           DynamicValueAttrs const &value,
                                           RealmContext &ctx) {
   ASSERT(value.accessor == std::nullopt);
-  ASSERT(value.instance == std::nullopt);
 
   TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value());
 
   MachineSpaceCoordinate device_coord = assert_unwrap(node.device_coord);
   Realm::Processor proc = ctx.map_device_coord_to_processor(device_coord);
   Realm::Memory memory = ctx.get_nearest_memory(proc);
-  auto [instance, ready] =
-      ctx.create_instance(memory, shape, Realm::ProfilingRequestSet());
-
-  DynamicValueAttrs result = value;
-  result.instance = instance;
-
-  return result;
+  return ctx.create_instance(memory, shape, Realm::ProfilingRequestSet());
 }
 
-std::pair<DynamicOpenDataflowGraph, Realm::Event> perform_instance_allocation(
+TensorInstanceBacking perform_instance_allocation(
     DynamicOpenDataflowGraph const &g,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &preallocated,
     RealmContext &ctx) {
-  ASSERT(no_instances_are_allocated(g));
-  ASSERT(instances_are_ready_for_allocation(g));
+  ASSERT(no_tensors_are_allocated(g));
+  ASSERT(tensors_are_ready_for_allocation(g));
   for (DynamicValueAttrs const &v : keys(preallocated)) {
     ASSERT(v.accessor == std::nullopt);
-    ASSERT(v.instance == std::nullopt);
   }
 
-  bidict<DynamicValueAttrs, DynamicValueAttrs> unallocated_to_allocated;
+  TensorInstanceBacking result = make_empty_tensor_instance_backing();
   auto allocate = [&](DynamicNodeAttrs const &n, DynamicValueAttrs const &v) {
     if (contains_key(preallocated, v)) {
       // FIXME: Attach external instance to existing allocation and use that
       NOT_IMPLEMENTED();
     } else {
-      if (contains_key(unallocated_to_allocated, v)) {
-        return unallocated_to_allocated.at_l(v);
+      if (contains_key(result.backing, v)) {
+        return result.backing.at(v);
       } else {
-        DynamicValueAttrs v2 = perform_instance_allocation_for_value(n, v, ctx);
-        uallocated_to_allocated.equate(v, v2);
+        result.backing.insert(
+            std::pair{v, perform_instance_allocation_for_value(n, v, ctx)});
       }
     }
   };
 
-  DynamicOpenDataflowGraph result = transform_dynamic_invocation_set(
-      g, [&](DynamicNodeInvocation const &i) -> DynamicNodeInvocation {
-        return DynamicNodeInvocation{
-            /*inputs=*/map_values(
-                i.inputs,
-                [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
-                  return allocate(i.node_attrs, v);
-                }),
-            /*node_attrs=*/i.node_attrs,
-            /*outputs=*/
-            map_values(i.outputs,
-                       [&](DynamicValueAttrs const &v) -> DynamicValueAttrs {
-                         return allocate(i.node_attrs, v);
-                       }),
-        };
-      });
-
-  ASSERT(all_instances_are_allocated(result));
-
-  return std::pair{result, ctx.get_outstanding_events()};
+  return result;
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index dddb624df3..e0e4f769d3 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -53,14 +53,6 @@ std::optional<GenericTensorAccessorR>
   return this->logit_grad_tensor;
 }
 
-static Realm::RegionInstance
-    get_loss_tensor_instance(DynamicOpenDataflowGraph const &dg,
-                             DynamicValueAttrs const &value) {
-  return find_output_tensor(dg, value.tensor_guid, value.role)
-      .value()
-      .second.instance.value();
-}
-
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
@@ -90,16 +82,11 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
 
   dg = perform_update_insertion(dg, optimizer_attrs);
   dg = perform_shard_expansion(dg);
-  Realm::Event instances_ready;
-  {
-    auto [dg2, ready] = perform_instance_allocation(dg, inputs, ctx);
-    dg = dg2;
-    instances_ready = ready;
-  }
+  TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
 
   std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
-        return get_loss_tensor_instance(dg, lgv);
+        return backing.backing.at(lgv).first;
       });
 
   dg = perform_device_state_initialization(dg,
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 30343652d7..4c02c13aa0 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -22,7 +22,7 @@ Realm::Processor RealmContext::map_device_coord_to_processor(
   NOT_IMPLEMENTED();
 }
 
-Realm::Memory get_nearest_memory(Realm::Processor proc) const {
+Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) const {
   NOT_IMPLEMENTED();
 }
 
diff --git a/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
new file mode 100644
index 0000000000..53c2a2b271
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
@@ -0,0 +1,11 @@
+#include "realm-execution/tensor_instance_backing.h"
+
+namespace FlexFlow {
+
+TensorInstanceBacking make_empty_tensor_instance_backing() {
+  return TensorInstanceBacking{
+      /*backing=*/{},
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/CMakeLists.txt b/lib/task-spec/CMakeLists.txt
index f4f5353f70..3c7c91af67 100644
--- a/lib/task-spec/CMakeLists.txt
+++ b/lib/task-spec/CMakeLists.txt
@@ -14,7 +14,6 @@ ff_add_library(
     pcg
     spdlog
     compiler
-    Realm::Realm
 )
 
 add_subdirectory(test)
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml
index 763ebf180f..89b94b1017 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml
@@ -14,8 +14,6 @@ includes = [
   "op-attrs/parallel_tensor_space_coordinate.dtg.h",
   "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h",
   "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h",
-  "task-spec/realm/fmt/instance.h",
-  "task-spec/realm/realm.h",
 ]
 
 src_includes = [
@@ -38,10 +36,6 @@ type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>"
 name = "accessor"
 type = "std::optional<::FlexFlow::DynamicTensorAccessor>"
 
-[[fields]]
-name = "instance"
-type = "std::optional<::FlexFlow::Realm::RegionInstance>"
-
 [[fields]]
 name = "role"
 type = "std::optional<::FlexFlow::DynamicTensorRole>"
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
index 837ade2aad..4270119612 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
@@ -23,7 +23,6 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
       /*parallel_tensor_shape=*/logit_value.parallel_tensor_shape,
       /*shard_coord=*/logit_value.shard_coord,
       /*accessor=*/std::nullopt,
-      /*instance=*/std::nullopt,
       /*role=*/mk_dynamic_tensor_role_loss(),
   };
   DynamicValueAttrs logit_grad_value{
@@ -31,7 +30,6 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
       /*parallel_tensor_shape=*/logit_value.parallel_tensor_shape,
       /*shard_coord=*/logit_value.shard_coord,
       /*accessor=*/std::nullopt,
-      /*instance=*/std::nullopt,
       /*role=*/mk_dynamic_tensor_role_bwd(),
   };
   DynamicNodeInvocation loss_invocation{
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc
index 294241b732..204597386e 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc
@@ -45,7 +45,6 @@ DynamicOpenDataflowGraph
                       /*parallel_tensor_shape=*/lift_to_parallel(attrs.shape),
                       /*shard_coord=*/std::nullopt,
                       /*accessor=*/std::nullopt,
-                      /*instance=*/std::nullopt,
                       /*role=*/std::nullopt,
                   },
               };
@@ -65,7 +64,6 @@ DynamicOpenDataflowGraph
                       /*parallel_tensor_shape=*/lift_to_parallel(attrs.shape),
                       /*shard_coord=*/std::nullopt,
                       /*accessor=*/std::nullopt,
-                      /*instance=*/std::nullopt,
                       /*role=*/std::nullopt,
                   },
               };
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
index eceb580a20..e90ef10398 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
@@ -44,7 +44,6 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
                             /*parallel_tensor_shape=*/attrs.shape,
                             /*shard_coord=*/std::nullopt,
                             /*accessor=*/std::nullopt,
-                            /*instance=*/std::nullopt,
                             /*role=*/std::nullopt,
                         },
                     };
@@ -65,7 +64,6 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
                             /*parallel_tensor_shape=*/attrs.shape,
                             /*shard_coord=*/std::nullopt,
                             /*accessor=*/std::nullopt,
-                            /*instance=*/std::nullopt,
                             /*role=*/std::nullopt,
                         },
                     };
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc
index 23708f3779..58a32db6c1 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc
@@ -51,7 +51,6 @@ static DynamicNodeInvocation get_update_invocation_for_invocation(
   DynamicValueAttrs value_attrs = output.second;
 
   ASSERT(value_attrs.accessor == std::nullopt);
-  ASSERT(value_attrs.instance == std::nullopt);
 
   DynamicNodeAttrs update_node_attrs = i.node_attrs;
   update_node_attrs.task_type = DynamicTaskType::UPD;
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc
index bb9a45e59a..fc9110b6e4 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc
@@ -16,7 +16,6 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*parallel_tensor_shape=*/std::nullopt,
         /*shard_coord=*/std::nullopt,
         /*accessor=*/std::nullopt,
-        /*instance=*/std::nullopt,
         /*tensor_type=*/std::nullopt,
     };
 
@@ -30,7 +29,6 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*parallel_tensor_shape=*/std::nullopt,
         /*shard_coord=*/std::nullopt,
         /*accessor=*/std::nullopt,
-        /*instance=*/std::nullopt,
         /*tensor_type=*/std::nullopt,
     };
 
@@ -44,7 +42,6 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*parallel_tensor_shape=*/std::nullopt,
         /*shard_coord=*/std::nullopt,
         /*accessor=*/std::nullopt,
-        /*instance=*/std::nullopt,
         /*tensor_type=*/std::nullopt,
     };
 
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc
index c28e12e0af..40d37f50df 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/machine_slicing.cc
@@ -76,7 +76,6 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/shard_coord,
           /*accessor=*/std::nullopt,
-          /*instance=*/std::nullopt,
           /*role=*/std::nullopt,
       };
     };
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc
index e57691b475..e8fcf2e40b 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/pass_expansion.cc
@@ -20,7 +20,6 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/std::nullopt,
           /*accessor=*/std::nullopt,
-          /*instance=*/std::nullopt,
           /*role=*/tensor_role,
       };
     };
@@ -114,7 +113,6 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/std::nullopt,
           /*accessor=*/std::nullopt,
-          /*instance=*/std::nullopt,
           /*role=*/tensor_role,
       };
     };
@@ -231,7 +229,6 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/std::nullopt,
           /*accessor=*/std::nullopt,
-          /*instance=*/std::nullopt,
           /*role=*/tensor_type,
       };
     };
diff --git a/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc
index 4d88dde805..23fbb6e514 100644
--- a/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc
+++ b/lib/task-spec/test/src/task-spec/dynamic_graph/shard_expansion.cc
@@ -121,7 +121,6 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*parallel_tensor_shape=*/std::nullopt,
           /*shard_coord=*/shard_coord,
           /*accessor=*/std::nullopt,
-          /*instance=*/std::nullopt,
           /*role=*/std::nullopt,
       };
     };

From 351692a7284fe3a5ba65a2ee3823fe6f35efc240 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 16:51:32 -0800
Subject: [PATCH 028/113] Implement processor queries.

---
 .../include/realm-execution/realm_context.h   | 11 +++-
 .../parallel_computation_graph_instance.cc    |  7 ++-
 .../src/realm-execution/realm_context.cc      | 56 ++++++++++++++++++-
 .../src/realm-execution/realm_manager.cc      |  6 +-
 4 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index bfc1a53cd3..73d60e9f50 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -6,14 +6,16 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "realm-execution/realm.h"
+#include <unordered_map>
 
 namespace FlexFlow {
 
 struct RealmContext {
 public:
-  RealmContext();
+  RealmContext(Realm::Processor);
   virtual ~RealmContext();
 
+  RealmContext() = delete;
   RealmContext(RealmContext const &) = delete;
   RealmContext(RealmContext &&) = delete;
 
@@ -23,6 +25,7 @@ struct RealmContext {
   Realm::Memory get_nearest_memory(Realm::Processor) const;
 
   // Current device context
+  Realm::Processor get_current_processor() const;
   Allocator &get_current_device_allocator() const;
   device_handle_t const &get_current_device_handle() const;
   device_id_t const &get_current_device_idx() const;
@@ -42,9 +45,15 @@ struct RealmContext {
   // Important: USER MUST BLOCK on event or else use it, or it WILL BE LOST
   [[nodiscard]] Realm::Event merge_outstanding_events();
 
+  void discover_machine_topology();
+
 protected:
   Realm::Runtime runtime;
+  Realm::Processor processor;
   std::vector<Realm::Event> outstanding_events;
+  std::unordered_map<std::pair<Realm::AddressSpace, Realm::Processor::Kind>,
+                     std::vector<Realm::Processor>>
+      processors;
 };
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index e0e4f769d3..5d6aeddf83 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -10,6 +10,7 @@
 #include "task-spec/dynamic_graph/shard_expansion.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
 #include "utils/exception.h"
+#include "utils/optional.h"
 
 namespace FlexFlow {
 
@@ -74,10 +75,12 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   std::optional<DynamicValueAttrs> logit_grad_value;
   if (loss_attrs) {
     auto [dg2, label_v, logit_grad_v] = perform_loss_insertion(
-        dg, loss_attrs.value(), dynamic_tensor_guid_t{logit_tensor.value()});
+        dg,
+        assert_unwrap(loss_attrs),
+        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)});
     dg = dg2;
     logit_grad_value = logit_grad_v;
-    inputs.insert(std::pair{label_v, label_tensor.value()});
+    inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
   }
 
   dg = perform_update_insertion(dg, optimizer_attrs);
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 4c02c13aa0..bf5f337796 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -1,14 +1,19 @@
 #include "realm-execution/realm_context.h"
 #include "op-attrs/datatype.h"
 #include "op-attrs/tensor_dims.dtg.h"
+#include "pcg/device_type.dtg.h"
 #include "realm-execution/realm_task_id_t.h"
 #include "realm-execution/task_id_t.dtg.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/transform.h"
 #include "utils/exception.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/one_to_many/one_to_many.h"
 #include "utils/positive_int/positive_int.h"
 
 namespace FlexFlow {
 
-RealmContext::RealmContext() {}
+RealmContext::RealmContext(Realm::Processor proc) : processor(proc) {}
 
 RealmContext::~RealmContext() {
   if (!this->outstanding_events.empty()) {
@@ -17,13 +22,45 @@ RealmContext::~RealmContext() {
   }
 }
 
+static std::tuple<Realm::AddressSpace, Realm::Processor::Kind, nonnegative_int>
+    convert_machine_space_coordinate(
+        MachineSpaceCoordinate const &device_coord) {
+  Realm::AddressSpace as = int{device_coord.node_idx};
+  Realm::Processor::Kind kind;
+  switch (device_coord.device_type) {
+    case DeviceType::CPU:
+      kind = Realm::Processor::Kind::LOC_PROC;
+      break;
+    case DeviceType::GPU:
+      kind = Realm::Processor::Kind::TOC_PROC;
+      break;
+    default:
+      PANIC("Unhandled DeviceType", fmt::to_string(device_coord.device_type));
+      break;
+  }
+  nonnegative_int proc_in_node = device_coord.device_idx;
+  return std::tuple{as, kind, proc_in_node};
+}
+
 Realm::Processor RealmContext::map_device_coord_to_processor(
     MachineSpaceCoordinate const &device_coord) {
-  NOT_IMPLEMENTED();
+  this->discover_machine_topology();
+  auto [as, kind, proc_in_node] =
+      convert_machine_space_coordinate(device_coord);
+  return this->processors.at(std::pair{as, kind}).at(int{proc_in_node});
 }
 
 Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) const {
-  NOT_IMPLEMENTED();
+  // FIXME: this isn't going to do what you expect until
+  // https://github.com/StanfordLegion/realm/pull/392 merges
+  Realm::Machine::MemoryQuery mq(Realm::Machine::get_machine());
+  mq.best_affinity_to(proc);
+  ASSERT(mq.count() > 0);
+  return mq.first();
+}
+
+Realm::Processor RealmContext::get_current_processor() const {
+  return this->processor;
 }
 
 Allocator &RealmContext::get_current_device_allocator() const {
@@ -136,4 +173,17 @@ Realm::Event RealmContext::merge_outstanding_events() {
   return result;
 }
 
+void RealmContext::discover_machine_topology() {
+  if (!this->processors.empty()) {
+    return;
+  }
+
+  Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+  for (Realm::Processor proc : pq) {
+    Realm::AddressSpace as = proc.address_space();
+    Realm::Processor::Kind kind = proc.kind();
+    this->processors[std::pair{as, kind}].push_back(proc);
+  }
+}
+
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 63c6266948..f8a3e4014b 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/realm_manager.h"
+#include "realm-execution/realm_context.h"
 #include "realm-execution/realm_task_id_t.h"
 #include "realm-execution/realm_task_registry.h"
 #include "realm-execution/task_id_t.dtg.h"
@@ -15,11 +16,12 @@ static void controller_task_wrapper(void const *args,
   std::function<void(RealmContext &)> thunk =
       *reinterpret_cast<std::function<void(RealmContext &)> const *>(args);
 
-  RealmContext ctx;
+  RealmContext ctx{proc};
   thunk(ctx);
 }
 
-RealmManager::RealmManager(int *argc, char ***argv) {
+RealmManager::RealmManager(int *argc, char ***argv)
+    : RealmContext(Realm::Processor::NO_PROC) {
   bool ok = this->runtime.init(argc, argv);
   ASSERT(ok);
 

From 7b23f47fd984b4812a0a542e023bd597b69f9ec3 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 7 Feb 2026 11:41:18 -0800
Subject: [PATCH 029/113] Enable PRealm.

---
 .flake/pkgs/realm.nix                                  | 10 ++++++----
 lib/realm-execution/include/realm-execution/realm.h    |  2 ++
 .../realm-execution/tensor_instance_backing.dtg.toml   |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix
index 1249c0ae28..b809573690 100644
--- a/.flake/pkgs/realm.nix
+++ b/.flake/pkgs/realm.nix
@@ -3,6 +3,7 @@
 , fetchFromGitHub
 , cmake
 , cudaPackages ? { }
+, zlib
 , maxDim ? 5
 }:
 
@@ -12,14 +13,13 @@ in
 
 stdenv.mkDerivation rec {
   pname = "realm";
-  version = "2025-01-06";
+  version = "2026-02-06";
 
-  # This version is compatible with Legion 7be1abd0207eb1126c7629b16d1123fa6f58ce9d
   src = fetchFromGitHub {
     owner = "StanfordLegion";
     repo = "realm";
-    rev = "0ef7edc8c012d4ab6a50805c044cec8a8edeae33";
-    sha256 = "sha256-57/a1lAgs+ajpRn0y0Lk1gP5nKt+N08WW0DIJP4vdho=";
+    rev = "0405b67ca14b586f7dec0dcddee194cecee7efa6";
+    sha256 = "sha256-iUPVV1rh3QuyDKgXuu8aDlaZGlNwcpPvPsSVLWp8tr4=";
   };
 
   nativeBuildInputs = [
@@ -29,11 +29,13 @@ stdenv.mkDerivation rec {
   cmakeFlags = [
     "-DBUILD_SHARED_LIBS=ON"
     "-DREALM_ENABLE_CUDA=ON"
+    "-DREALM_ENABLE_PREALM=ON"
     "-DREALM_MAX_DIM=${toString maxDim}"
   ];
 
   buildInputs = [
     cudatoolkit
+    zlib
   ];
 
   meta = with lib; {
diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h
index 8123c9e9fa..b6913e66f5 100644
--- a/lib/realm-execution/include/realm-execution/realm.h
+++ b/lib/realm-execution/include/realm-execution/realm.h
@@ -1,6 +1,8 @@
 #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 
+#define FLEXFLOW_USE_PREALM
+
 #ifdef FLEXFLOW_USE_PREALM
 #include <realm/prealm/prealm.h>
 #else
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index bdf08df59c..e6a8bd58d9 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -4,7 +4,7 @@ type = "struct"
 features = [
   "eq",
   #"fmt",
-  "hash",
+  #"hash",
 ]
 
 includes = [

From 1f55dd3ae60a2b4ee9970d3c230096302448cf7b Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 10:18:32 -0800
Subject: [PATCH 030/113] Move tasks to dedicated file, stub out device state
 init, shuffle directories.

---
 .../distributed_device_state_initialization.h | 21 ++++++++++++++++
 .../{ => tasks}/realm_task_id_t.h             |  2 +-
 .../{ => tasks}/realm_task_registry.h         |  2 +-
 .../realm-execution/tasks/realm_tasks.h       | 15 ++++++++++++
 .../{ => tasks}/task_id_t.dtg.toml            |  0
 .../realm-execution/{ => tasks}/task_id_t.h   |  2 +-
 ...distributed_device_state_initialization.cc | 15 ++++++++++++
 .../parallel_computation_graph_instance.cc    | 17 ++++++-------
 .../src/realm-execution/realm_context.cc      |  4 ++--
 .../src/realm-execution/realm_manager.cc      | 23 +++---------------
 .../{ => tasks}/realm_task_id_t.cc            |  2 +-
 .../{ => tasks}/realm_task_registry.cc        | 21 ++++++++--------
 .../src/realm-execution/tasks/realm_tasks.cc  | 24 +++++++++++++++++++
 .../realm-execution/{ => tasks}/task_id_t.cc  |  2 +-
 14 files changed, 104 insertions(+), 46 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
 rename lib/realm-execution/include/realm-execution/{ => tasks}/realm_task_id_t.h (86%)
 rename lib/realm-execution/include/realm-execution/{ => tasks}/realm_task_registry.h (94%)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
 rename lib/realm-execution/include/realm-execution/{ => tasks}/task_id_t.dtg.toml (100%)
 rename lib/realm-execution/include/realm-execution/{ => tasks}/task_id_t.h (94%)
 create mode 100644 lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
 rename lib/realm-execution/src/realm-execution/{ => tasks}/realm_task_id_t.cc (82%)
 rename lib/realm-execution/src/realm-execution/{ => tasks}/realm_task_registry.cc (86%)
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
 rename lib/realm-execution/src/realm-execution/{ => tasks}/task_id_t.cc (99%)

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
new file mode 100644
index 0000000000..4121f10341
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_STATE_INITIALIZATION_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_STATE_INITIALIZATION_H
+
+#include "kernels/profiling_settings.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+
+namespace FlexFlow {
+
+DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
+    DynamicOpenDataflowGraph const &,
+    RealmContext &ctx,
+    ProfilingSettings const &profiling_settings,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
similarity index 86%
rename from lib/realm-execution/include/realm-execution/realm_task_id_t.h
rename to lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
index 8e6da1a2bd..cd5eba2f34 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
 
 #include "realm-execution/realm.h"
-#include "realm-execution/task_id_t.dtg.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
similarity index 94%
rename from lib/realm-execution/include/realm-execution/realm_task_registry.h
rename to lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
index f800b1d8c4..a0277382bf 100644
--- a/lib/realm-execution/include/realm-execution/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
 
 #include "realm-execution/realm.h"
-#include "realm-execution/task_id_t.dtg.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h b/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
new file mode 100644
index 0000000000..d2b104faa8
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
@@ -0,0 +1,15 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASKS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASKS_H
+
+#include "realm-execution/realm.h"
+
+namespace FlexFlow {
+
+void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
+
+void controller_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
similarity index 100%
rename from lib/realm-execution/include/realm-execution/task_id_t.dtg.toml
rename to lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
diff --git a/lib/realm-execution/include/realm-execution/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
similarity index 94%
rename from lib/realm-execution/include/realm-execution/task_id_t.h
rename to lib/realm-execution/include/realm-execution/tasks/task_id_t.h
index 38b82ad9e0..4a5d9299ae 100644
--- a/lib/realm-execution/include/realm-execution/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -3,7 +3,7 @@
 
 #include "op-attrs/pcg_operator_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "realm-execution/task_id_t.dtg.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
 #include <optional>
 
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
new file mode 100644
index 0000000000..c6d0621f3d
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -0,0 +1,15 @@
+#include "realm-execution/distributed_device_state_initialization.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
+    DynamicOpenDataflowGraph const &dg,
+    RealmContext &ctx,
+    ProfilingSettings const &profiling_settings,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 5d6aeddf83..bb763334d5 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -1,6 +1,6 @@
 #include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
-#include "local-execution/device_state_initialization.h"
 #include "pcg/optimizer_attrs.h"
+#include "realm-execution/distributed_device_state_initialization.h"
 #include "realm-execution/instance_allocation.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
@@ -92,14 +92,15 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
         return backing.backing.at(lgv).first;
       });
 
-  dg = perform_device_state_initialization(dg,
-                                           ctx.get_current_device_allocator(),
-                                           profiling_settings,
-                                           ctx.get_current_device_handle(),
-                                           iteration_config,
-                                           optimizer_attrs,
-                                           ctx.get_current_device_idx());
+  dg = perform_distributed_device_state_initialization(
+      dg, ctx, profiling_settings, iteration_config, optimizer_attrs);
   NOT_IMPLEMENTED();
+
+  // TODO list:
+  //  * per-device state initialization (RPC mechanism?)
+  //  * Realm allocator
+  //  * task body
+  //  * external instances
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index bf5f337796..37f72ba86d 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -2,8 +2,8 @@
 #include "op-attrs/datatype.h"
 #include "op-attrs/tensor_dims.dtg.h"
 #include "pcg/device_type.dtg.h"
-#include "realm-execution/realm_task_id_t.h"
-#include "realm-execution/task_id_t.dtg.h"
+#include "realm-execution/tasks/realm_task_id_t.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/transform.h"
 #include "utils/exception.h"
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index f8a3e4014b..9d8b9f0b7f 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -1,25 +1,12 @@
 #include "realm-execution/realm_manager.h"
 #include "realm-execution/realm_context.h"
-#include "realm-execution/realm_task_id_t.h"
-#include "realm-execution/realm_task_registry.h"
-#include "realm-execution/task_id_t.dtg.h"
+#include "realm-execution/tasks/realm_task_id_t.h"
+#include "realm-execution/tasks/realm_task_registry.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
 
-static void controller_task_wrapper(void const *args,
-                                    size_t arglen,
-                                    void const *userdata,
-                                    size_t userlen,
-                                    Realm::Processor proc) {
-  ASSERT(arglen == sizeof(std::function<void(RealmContext &)>));
-  std::function<void(RealmContext &)> thunk =
-      *reinterpret_cast<std::function<void(RealmContext &)> const *>(args);
-
-  RealmContext ctx{proc};
-  thunk(ctx);
-}
-
 RealmManager::RealmManager(int *argc, char ***argv)
     : RealmContext(Realm::Processor::NO_PROC) {
   bool ok = this->runtime.init(argc, argv);
@@ -27,10 +14,6 @@ RealmManager::RealmManager(int *argc, char ***argv)
 
   // Register all tasks at initialization time so we don't need to later
   register_all_tasks().wait();
-  register_task(Realm::Processor::LOC_PROC,
-                task_id_t::CONTROLLER_TASK_ID,
-                controller_task_wrapper)
-      .wait();
 }
 
 RealmManager::~RealmManager() {
diff --git a/lib/realm-execution/src/realm-execution/realm_task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
similarity index 82%
rename from lib/realm-execution/src/realm-execution/realm_task_id_t.cc
rename to lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
index 50b23dfe86..ec1aa143a6 100644
--- a/lib/realm-execution/src/realm-execution/realm_task_id_t.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/realm_task_id_t.h"
+#include "realm-execution/tasks/realm_task_id_t.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/src/realm-execution/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
similarity index 86%
rename from lib/realm-execution/src/realm-execution/realm_task_registry.cc
rename to lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 436a6af3f3..7e30edbc9f 100644
--- a/lib/realm-execution/src/realm-execution/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -1,14 +1,10 @@
-#include "realm-execution/realm_task_registry.h"
-#include "realm-execution/realm_task_id_t.h"
+#include "realm-execution/tasks/realm_task_registry.h"
+#include "realm-execution/tasks/realm_task_id_t.h"
+#include "realm-execution/tasks/realm_tasks.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
 
-static void operation_task_wrapper(
-    void const *, size_t, void const *, size_t, Realm::Processor) {
-  NOT_IMPLEMENTED();
-}
-
 Realm::Event register_task(Realm::Processor::Kind target_kind,
                            task_id_t func_id,
                            void (*task_body)(void const *,
@@ -110,12 +106,15 @@ Realm::Event register_all_tasks() {
   };
 
   for (task_id_t task_id : task_ids) {
-    pending_registrations.push_back(register_task(
-        Realm::Processor::LOC_PROC, task_id, operation_task_wrapper));
-    pending_registrations.push_back(register_task(
-        Realm::Processor::TOC_PROC, task_id, operation_task_wrapper));
+    pending_registrations.push_back(
+        register_task(Realm::Processor::LOC_PROC, task_id, op_task_body));
+    pending_registrations.push_back(
+        register_task(Realm::Processor::TOC_PROC, task_id, op_task_body));
   }
 
+  pending_registrations.push_back(register_task(Realm::Processor::LOC_PROC,
+                                                task_id_t::CONTROLLER_TASK_ID,
+                                                controller_task_body));
   return Realm::Event::merge_events(pending_registrations);
 }
 
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc b/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
new file mode 100644
index 0000000000..a50f7f3e47
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
@@ -0,0 +1,24 @@
+#include "realm-execution/tasks/realm_tasks.h"
+#include "realm-execution/realm_context.h"
+
+namespace FlexFlow {
+
+void op_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor) {
+  NOT_IMPLEMENTED();
+}
+
+void controller_task_body(void const *args,
+                          size_t arglen,
+                          void const *userdata,
+                          size_t userlen,
+                          Realm::Processor proc) {
+  ASSERT(arglen == sizeof(std::function<void(RealmContext &)>));
+  std::function<void(RealmContext &)> thunk =
+      *reinterpret_cast<std::function<void(RealmContext &)> const *>(args);
+
+  RealmContext ctx{proc};
+  thunk(ctx);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
similarity index 99%
rename from lib/realm-execution/src/realm-execution/task_id_t.cc
rename to lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
index 3521f50c02..5a99f2bea8 100644
--- a/lib/realm-execution/src/realm-execution/task_id_t.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/task_id_t.h"
+#include "realm-execution/tasks/task_id_t.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"

From ea2b652cd0d851efecf0ae962f0029daab690314 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 10:47:41 -0800
Subject: [PATCH 031/113] Add task args structs; use them in controller_task_body.

---
 .../realm-execution/tasks/realm_tasks.h       | 20 +++++++++++++++++++
 .../src/realm-execution/tasks/realm_tasks.cc  | 14 +++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h b/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
index d2b104faa8..ceda961914 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
@@ -2,11 +2,31 @@
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASKS_H
 
 #include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include <type_traits>
 
 namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
+// TODO: at some point we're going to have to actually serialize these, but for
+// now just pass the pointer and assume we're running inside a single address
+// space
+struct DeviceInitTaskArgs {
+public:
+  DynamicNodeInvocation *invocation;
+};
+static_assert(std::has_unique_object_representations_v<DeviceInitTaskArgs>);
+
+void device_init_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+struct ControllerTaskArgs {
+public:
+  std::function<void(RealmContext &)> thunk;
+};
+
 void controller_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc b/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
index a50f7f3e47..b1da1f0694 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
@@ -1,5 +1,6 @@
 #include "realm-execution/tasks/realm_tasks.h"
 #include "realm-execution/realm_context.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -8,17 +9,22 @@ void op_task_body(
   NOT_IMPLEMENTED();
 }
 
+void device_init_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor) {
+  NOT_IMPLEMENTED();
+}
+
 void controller_task_body(void const *args,
                           size_t arglen,
                           void const *userdata,
                           size_t userlen,
                           Realm::Processor proc) {
-  ASSERT(arglen == sizeof(std::function<void(RealmContext &)>));
-  std::function<void(RealmContext &)> thunk =
-      *reinterpret_cast<std::function<void(RealmContext &)> const *>(args);
+  ASSERT(arglen == sizeof(ControllerTaskArgs));
+  ControllerTaskArgs task_args =
+      *reinterpret_cast<ControllerTaskArgs const *>(args);
 
   RealmContext ctx{proc};
-  thunk(ctx);
+  task_args.thunk(ctx);
 }
 
 } // namespace FlexFlow

From 99b3f4c6ab16e27e058ec324ba7198852f57bf40 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 10:54:01 -0800
Subject: [PATCH 032/113] Pass ControllerTaskArgs when spawning the controller task.

---
 lib/realm-execution/src/realm-execution/realm_manager.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 9d8b9f0b7f..dec2ed7847 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -2,6 +2,7 @@
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tasks/realm_task_id_t.h"
 #include "realm-execution/tasks/realm_task_registry.h"
+#include "realm-execution/tasks/realm_tasks.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "utils/exception.h"
 
@@ -29,11 +30,13 @@ Realm::Event
           .only_kind(Realm::Processor::LOC_PROC)
           .first();
 
+  ControllerTaskArgs task_args;
+  task_args.thunk = thunk;
   Realm::Event task_complete = this->runtime.collective_spawn(
       target_proc,
       get_realm_task_id_for_task_id(task_id_t::CONTROLLER_TASK_ID),
-      &thunk,
-      sizeof(thunk));
+      &task_args,
+      sizeof(task_args));
   this->outstanding_events.push_back(task_complete);
   return task_complete;
 }

From 5338dfc3b0837625de4092ff63c17e21a8940ff8 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 14:26:45 -0800
Subject: [PATCH 033/113] Refactor task APIs.

---
 .../include/realm-execution/realm_context.h   | 18 +++++++
 .../tasks/impl/controller_task.h              | 19 +++++++
 .../tasks/impl/device_init_return_task.h      | 21 ++++++++
 .../tasks/impl/device_init_task.h             | 24 +++++++++
 .../realm-execution/tasks/impl/op_task.h      | 21 ++++++++
 .../tasks/realm_task_registry.h               |  4 +-
 .../realm-execution/tasks/realm_tasks.h       | 35 ------------
 .../realm-execution/tasks/task_id_t.dtg.toml  |  3 ++
 .../include/realm-execution/tasks/task_id_t.h |  4 +-
 .../src/realm-execution/realm_context.cc      | 35 ++++++++++++
 .../src/realm-execution/realm_manager.cc      | 15 +-----
 .../tasks/impl/controller_task.cc             | 37 +++++++++++++
 .../tasks/impl/device_init_return_task.cc     | 49 +++++++++++++++++
 .../tasks/impl/device_init_task.cc            | 54 +++++++++++++++++++
 .../src/realm-execution/tasks/impl/op_task.cc | 48 +++++++++++++++++
 .../tasks/realm_task_registry.cc              |  5 +-
 .../src/realm-execution/tasks/realm_tasks.cc  | 30 -----------
 17 files changed, 339 insertions(+), 83 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
 delete mode 100644 lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
 delete mode 100644 lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 73d60e9f50..422c4f4027 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -6,6 +6,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "realm-execution/realm.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 #include <unordered_map>
 
 namespace FlexFlow {
@@ -30,6 +31,23 @@ struct RealmContext {
   device_handle_t const &get_current_device_handle() const;
   device_id_t const &get_current_device_idx() const;
 
+  // Task creation
+  Realm::Event spawn_task(Realm::Processor proc,
+                          task_id_t task_id,
+                          void const *args,
+                          size_t arglen,
+                          Realm::ProfilingRequestSet const &requests,
+                          Realm::Event wait_on = Realm::Event::NO_EVENT,
+                          int priority = 0);
+
+  Realm::Event
+      collective_spawn_task(Realm::Processor target_proc,
+                            task_id_t task_id,
+                            void const *args,
+                            size_t arglen,
+                            Realm::Event wait_on = Realm::Event::NO_EVENT,
+                            int priority = 0);
+
   // Instance management
   std::pair<Realm::RegionInstance, Realm::Event>
       create_instance(Realm::Memory memory,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
new file mode 100644
index 0000000000..d4c397bb37
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_CONTROLLER_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_CONTROLLER_TASK_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+
+namespace FlexFlow {
+
+void controller_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+Realm::Event
+    collective_spawn_controller_task(RealmContext &ctx,
+                                     Realm::Processor &target_proc,
+                                     std::function<void(RealmContext &)> thunk);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
new file mode 100644
index 0000000000..fc6c8bdb9f
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_RETURN_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_RETURN_TASK_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/device_specific_per_device_op_state.dtg.h"
+
+namespace FlexFlow {
+
+void device_init_return_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+Realm::Event spawn_device_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificPerDeviceOpState const &result,
+    DeviceSpecificPerDeviceOpState *origin_result_ptr);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
new file mode 100644
index 0000000000..bd4ca269df
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
@@ -0,0 +1,24 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_TASK_H
+
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/device_specific_per_device_op_state.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+
+namespace FlexFlow {
+
+void device_init_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+Realm::Event
+    spawn_device_init_task(RealmContext &ctx,
+                           Realm::Processor &target_proc,
+                           DynamicNodeInvocation const &invocation,
+                           std::optional<OptimizerAttrs> const &optimizer_attrs,
+                           DeviceSpecificPerDeviceOpState *result_ptr);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
new file mode 100644
index 0000000000..4c3e6d38d1
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H
+
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+
+namespace FlexFlow {
+
+void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
+
+Realm::Event
+    spawn_op_task(RealmContext &ctx,
+                  Realm::Processor &target_proc,
+                  DynamicNodeInvocation const &invocation,
+                  std::optional<OptimizerAttrs> const &optimizer_attrs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
index a0277382bf..8114f1a82c 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_REGISTRY_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_REGISTRY_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_REGISTRY_H
 
 #include "realm-execution/realm.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h b/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
deleted file mode 100644
index ceda961914..0000000000
--- a/lib/realm-execution/include/realm-execution/tasks/realm_tasks.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASKS_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASKS_H
-
-#include "realm-execution/realm.h"
-#include "realm-execution/realm_context.h"
-#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
-#include <type_traits>
-
-namespace FlexFlow {
-
-void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
-
-// TODO: at some point we're going to have to actually serialize these, but for
-// now just pass the pointer and assume we're running inside a single address
-// space
-struct DeviceInitTaskArgs {
-public:
-  DynamicNodeInvocation *invocation;
-};
-static_assert(std::has_unique_object_representations_v<DeviceInitTaskArgs>);
-
-void device_init_task_body(
-    void const *, size_t, void const *, size_t, Realm::Processor);
-
-struct ControllerTaskArgs {
-public:
-  std::function<void(RealmContext &)> thunk;
-};
-
-void controller_task_body(
-    void const *, size_t, void const *, size_t, Realm::Processor);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
index 0336bc81a4..34e5183488 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
@@ -11,6 +11,9 @@ features = [
 [[values]]
 name = "CONTROLLER_TASK_ID"
 
+[[values]]
+name = "DEVICE_INIT_RETURN_TASK_ID"
+
 [[values]]
 name = "IMAGE_INIT_TASK_ID"
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
index 4a5d9299ae..53945d2e5b 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASK_ID_T_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASK_ID_T_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_TASK_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_TASK_ID_T_H
 
 #include "op-attrs/pcg_operator_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 37f72ba86d..7e6c73c9e7 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -74,6 +74,41 @@ device_id_t const &RealmContext::get_current_device_idx() const {
   NOT_IMPLEMENTED();
 }
 
+Realm::Event
+    RealmContext::spawn_task(Realm::Processor proc,
+                             task_id_t task_id,
+                             void const *args,
+                             size_t arglen,
+                             Realm::ProfilingRequestSet const &requests,
+                             Realm::Event wait_on,
+                             int priority) {
+  Realm::Event result = proc.spawn(get_realm_task_id_for_task_id(task_id),
+                                   args,
+                                   arglen,
+                                   requests,
+                                   wait_on,
+                                   priority);
+  this->outstanding_events.push_back(result);
+  return result;
+}
+
+Realm::Event RealmContext::collective_spawn_task(Realm::Processor target_proc,
+                                                 task_id_t task_id,
+                                                 void const *args,
+                                                 size_t arglen,
+                                                 Realm::Event wait_on,
+                                                 int priority) {
+  Realm::Event result =
+      this->runtime.collective_spawn(target_proc,
+                                     get_realm_task_id_for_task_id(task_id),
+                                     args,
+                                     arglen,
+                                     wait_on,
+                                     priority);
+  this->outstanding_events.push_back(result);
+  return result;
+}
+
 template <int N>
 static Realm::Rect<N> rect_from_dims(TensorDims const &dims) {
   std::vector<int> values{dims.ff_ordered.begin(), dims.ff_ordered.end()};
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index dec2ed7847..7233103cc3 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -1,10 +1,7 @@
 #include "realm-execution/realm_manager.h"
 #include "realm-execution/realm_context.h"
-#include "realm-execution/tasks/realm_task_id_t.h"
+#include "realm-execution/tasks/impl/controller_task.h"
 #include "realm-execution/tasks/realm_task_registry.h"
-#include "realm-execution/tasks/realm_tasks.h"
-#include "realm-execution/tasks/task_id_t.dtg.h"
-#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -30,15 +27,7 @@ Realm::Event
           .only_kind(Realm::Processor::LOC_PROC)
           .first();
 
-  ControllerTaskArgs task_args;
-  task_args.thunk = thunk;
-  Realm::Event task_complete = this->runtime.collective_spawn(
-      target_proc,
-      get_realm_task_id_for_task_id(task_id_t::CONTROLLER_TASK_ID),
-      &task_args,
-      sizeof(task_args));
-  this->outstanding_events.push_back(task_complete);
-  return task_complete;
+  return collective_spawn_controller_task(*this, target_proc, thunk);
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
new file mode 100644
index 0000000000..2fd5cee52d
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
@@ -0,0 +1,37 @@
+#include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tasks/task_id_t.h"
+
+namespace FlexFlow {
+
+struct ControllerTaskArgs {
+public:
+  std::function<void(RealmContext &)> thunk;
+};
+
+void controller_task_body(void const *args,
+                          size_t arglen,
+                          void const *userdata,
+                          size_t userlen,
+                          Realm::Processor proc) {
+  ASSERT(arglen == sizeof(ControllerTaskArgs));
+  ControllerTaskArgs task_args =
+      *reinterpret_cast<ControllerTaskArgs const *>(args);
+
+  RealmContext ctx{proc};
+  task_args.thunk(ctx);
+}
+
+Realm::Event collective_spawn_controller_task(
+    RealmContext &ctx,
+    Realm::Processor &target_proc,
+    std::function<void(RealmContext &)> thunk) {
+  ControllerTaskArgs task_args;
+  task_args.thunk = thunk;
+
+  return ctx.collective_spawn_task(target_proc,
+                                   task_id_t::CONTROLLER_TASK_ID,
+                                   &task_args,
+                                   sizeof(task_args));
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
new file mode 100644
index 0000000000..fa421cda30
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
@@ -0,0 +1,49 @@
+#include "realm-execution/tasks/impl/device_init_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+// FIXME: Can't make this trivially copyable?
+struct DeviceInitReturnTaskArgs {
+public:
+  DeviceInitReturnTaskArgs() = delete;
+  DeviceInitReturnTaskArgs(DeviceSpecificPerDeviceOpState result,
+                           Realm::Processor origin_proc,
+                           DeviceSpecificPerDeviceOpState *origin_result_ptr)
+      : result(result), origin_proc(origin_proc),
+        origin_result_ptr(origin_result_ptr) {}
+
+public:
+  DeviceSpecificPerDeviceOpState result;
+  Realm::Processor origin_proc;
+  DeviceSpecificPerDeviceOpState *origin_result_ptr;
+};
+
+void device_init_return_task_body(void const *args,
+                                  size_t arglen,
+                                  void const *userdata,
+                                  size_t userlen,
+                                  Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceInitReturnTaskArgs));
+  DeviceInitReturnTaskArgs task_args =
+      *reinterpret_cast<DeviceInitReturnTaskArgs const *>(args);
+
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+  *task_args.origin_result_ptr = task_args.result;
+}
+
+Realm::Event spawn_device_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificPerDeviceOpState const &result,
+    DeviceSpecificPerDeviceOpState *origin_result_ptr) {
+  DeviceInitReturnTaskArgs task_args{result, origin_proc, origin_result_ptr};
+
+  return ctx.spawn_task(origin_proc,
+                        task_id_t::DEVICE_INIT_RETURN_TASK_ID,
+                        &task_args,
+                        sizeof(task_args),
+                        Realm::ProfilingRequestSet{});
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
new file mode 100644
index 0000000000..0deb8407c4
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
@@ -0,0 +1,54 @@
+#include "realm-execution/tasks/impl/device_init_task.h"
+#include "realm-execution/tasks/task_id_t.h"
+#include "utils/optional.h"
+#include <type_traits>
+
+namespace FlexFlow {
+
+// TODO: at some point we're going to have to actually serialize these, but for
+// now just pass the pointer and assume we're running inside a single address
+// space
+struct DeviceInitTaskArgs {
+public:
+  DynamicNodeInvocation const *invocation;
+  Realm::Processor origin_proc;
+  DeviceSpecificPerDeviceOpState *origin_result_ptr;
+};
+static_assert(std::has_unique_object_representations_v<DeviceInitTaskArgs>);
+
+void device_init_task_body(void const *args,
+                           size_t arglen,
+                           void const *userdata,
+                           size_t userlen,
+                           Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceInitTaskArgs));
+  DeviceInitTaskArgs task_args =
+      *reinterpret_cast<DeviceInitTaskArgs const *>(args);
+
+  // FIXME: not safe to dereference unless we're on the same address space
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+
+  RealmContext ctx{proc};
+  NOT_IMPLEMENTED();
+}
+
+Realm::Event
+    spawn_device_init_task(RealmContext &ctx,
+                           Realm::Processor &target_proc,
+                           DynamicNodeInvocation const &invocation,
+                           std::optional<OptimizerAttrs> const &optimizer_attrs,
+                           DeviceSpecificPerDeviceOpState *result_ptr) {
+  DeviceInitTaskArgs task_args;
+  task_args.invocation = &invocation;
+  task_args.origin_proc = ctx.get_current_processor();
+  task_args.origin_result_ptr = result_ptr;
+
+  return ctx.spawn_task(target_proc,
+                        assert_unwrap(get_init_task_id_for_op_attrs(
+                            assert_unwrap(invocation.node_attrs.op_attrs))),
+                        &task_args,
+                        sizeof(task_args),
+                        Realm::ProfilingRequestSet{});
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
new file mode 100644
index 0000000000..9d9a36e2d5
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -0,0 +1,48 @@
+#include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tasks/task_id_t.h"
+#include "utils/optional.h"
+#include <type_traits>
+
+namespace FlexFlow {
+
+// TODO: at some point we're going to have to actually serialize these, but for
+// now just pass the pointer and assume we're running inside a single address
+// space
+struct OpTaskArgs {
+public:
+  DynamicNodeInvocation const *invocation;
+  Realm::Processor origin_proc;
+};
+static_assert(std::has_unique_object_representations_v<OpTaskArgs>);
+
+void op_task_body(void const *args,
+                  size_t arglen,
+                  void const *userdata,
+                  size_t userlen,
+                  Realm::Processor proc) {
+  ASSERT(arglen == sizeof(OpTaskArgs));
+  OpTaskArgs task_args = *reinterpret_cast<OpTaskArgs const *>(args);
+
+  // FIXME: not safe to dereference unless we're on the same address space
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+
+  RealmContext ctx{proc};
+  NOT_IMPLEMENTED();
+}
+
+Realm::Event
+    spawn_op_task(RealmContext &ctx,
+                  Realm::Processor &target_proc,
+                  DynamicNodeInvocation const &invocation,
+                  std::optional<OptimizerAttrs> const &optimizer_attrs) {
+  OpTaskArgs task_args;
+  task_args.invocation = &invocation;
+  return ctx.spawn_task(
+      target_proc,
+      assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)),
+      &task_args,
+      sizeof(task_args),
+      Realm::ProfilingRequestSet{});
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 7e30edbc9f..c604d1b06a 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -1,6 +1,9 @@
 #include "realm-execution/tasks/realm_task_registry.h"
+#include "realm-execution/tasks/impl/controller_task.h"
+#include "realm-execution/tasks/impl/device_init_return_task.h"
+#include "realm-execution/tasks/impl/device_init_task.h"
+#include "realm-execution/tasks/impl/op_task.h"
 #include "realm-execution/tasks/realm_task_id_t.h"
-#include "realm-execution/tasks/realm_tasks.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc b/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
deleted file mode 100644
index b1da1f0694..0000000000
--- a/lib/realm-execution/src/realm-execution/tasks/realm_tasks.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-#include "realm-execution/tasks/realm_tasks.h"
-#include "realm-execution/realm_context.h"
-#include "utils/exception.h"
-
-namespace FlexFlow {
-
-void op_task_body(
-    void const *, size_t, void const *, size_t, Realm::Processor) {
-  NOT_IMPLEMENTED();
-}
-
-void device_init_task_body(
-    void const *, size_t, void const *, size_t, Realm::Processor) {
-  NOT_IMPLEMENTED();
-}
-
-void controller_task_body(void const *args,
-                          size_t arglen,
-                          void const *userdata,
-                          size_t userlen,
-                          Realm::Processor proc) {
-  ASSERT(arglen == sizeof(ControllerTaskArgs));
-  ControllerTaskArgs task_args =
-      *reinterpret_cast<ControllerTaskArgs const *>(args);
-
-  RealmContext ctx{proc};
-  task_args.thunk(ctx);
-}
-
-} // namespace FlexFlow

From f87866f10a4eb46fd4ba69e47d2ebb12af1aa54c Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 14:45:44 -0800
Subject: [PATCH 034/113] Finish implementation of device init task.

---
 .../tasks/impl/device_init_task.h             | 15 +++---
 .../realm-execution/tasks/realm_task_id_t.h   |  4 +-
 .../tasks/impl/device_init_task.cc            | 50 ++++++++++++++++---
 .../tasks/realm_task_registry.cc              | 13 ++++-
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
index bd4ca269df..ebce5fed4c 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
@@ -1,23 +1,26 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_TASK_H
 
+#include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
 
 namespace FlexFlow {
 
 void device_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event
-    spawn_device_init_task(RealmContext &ctx,
-                           Realm::Processor &target_proc,
-                           DynamicNodeInvocation const &invocation,
-                           std::optional<OptimizerAttrs> const &optimizer_attrs,
-                           DeviceSpecificPerDeviceOpState *result_ptr);
+Realm::Event spawn_device_init_task(RealmContext &ctx,
+                                    Realm::Processor &target_proc,
+                                    DynamicNodeInvocation const &invocation,
+                                    ProfilingSettings const &profiling_settings,
+                                    FFIterationConfig const &iteration_config,
+                                    OptimizerAttrs const &optimizer_attrs,
+                                    DeviceSpecificPerDeviceOpState *result_ptr);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
index cd5eba2f34..a3c6891fb0 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_TASK_ID_T_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_ID_T_H
 
 #include "realm-execution/realm.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
index 0deb8407c4..c27fc5802b 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
@@ -1,6 +1,9 @@
 #include "realm-execution/tasks/impl/device_init_task.h"
+#include "local-execution/device_state_initialization.h"
+#include "realm-execution/tasks/impl/device_init_return_task.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "utils/optional.h"
+#include <optional>
 #include <type_traits>
 
 namespace FlexFlow {
@@ -9,8 +12,22 @@ namespace FlexFlow {
 // now just pass the pointer and assume we're running inside a single address
 // space
 struct DeviceInitTaskArgs {
+  DeviceInitTaskArgs() = delete;
+  DeviceInitTaskArgs(DynamicNodeInvocation const *invocation,
+                     ProfilingSettings const *profiling_settings,
+                     FFIterationConfig const *iteration_config,
+                     OptimizerAttrs const *optimizer_attrs,
+                     Realm::Processor origin_proc,
+                     DeviceSpecificPerDeviceOpState *origin_result_ptr)
+      : invocation(invocation), profiling_settings(profiling_settings),
+        iteration_config(iteration_config), optimizer_attrs(optimizer_attrs),
+        origin_proc(origin_proc), origin_result_ptr(origin_result_ptr) {}
+
 public:
   DynamicNodeInvocation const *invocation;
+  ProfilingSettings const *profiling_settings;
+  FFIterationConfig const *iteration_config;
+  OptimizerAttrs const *optimizer_attrs;
   Realm::Processor origin_proc;
   DeviceSpecificPerDeviceOpState *origin_result_ptr;
 };
@@ -29,19 +46,40 @@ void device_init_task_body(void const *args,
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
 
   RealmContext ctx{proc};
-  NOT_IMPLEMENTED();
+  DynamicNodeInvocation result_invocation =
+      initialize_node(*task_args.invocation,
+                      ctx.get_current_device_allocator(),
+                      *task_args.profiling_settings,
+                      ctx.get_current_device_handle(),
+                      *task_args.iteration_config,
+                      *task_args.optimizer_attrs,
+                      ctx.get_current_device_idx());
+  std::optional<DeviceSpecificPerDeviceOpState> result_state =
+      result_invocation.node_attrs.per_device_op_state;
+  if (result_state) {
+    spawn_device_init_return_task(ctx,
+                                  task_args.origin_proc,
+                                  assert_unwrap(result_state),
+                                  task_args.origin_result_ptr);
+  }
 }
 
 Realm::Event
     spawn_device_init_task(RealmContext &ctx,
                            Realm::Processor &target_proc,
                            DynamicNodeInvocation const &invocation,
-                           std::optional<OptimizerAttrs> const &optimizer_attrs,
+                           ProfilingSettings const &profiling_settings,
+                           FFIterationConfig const &iteration_config,
+                           OptimizerAttrs const &optimizer_attrs,
                            DeviceSpecificPerDeviceOpState *result_ptr) {
-  DeviceInitTaskArgs task_args;
-  task_args.invocation = &invocation;
-  task_args.origin_proc = ctx.get_current_processor();
-  task_args.origin_result_ptr = result_ptr;
+  DeviceInitTaskArgs task_args{
+      &invocation,
+      &profiling_settings,
+      &iteration_config,
+      &optimizer_attrs,
+      ctx.get_current_processor(),
+      result_ptr,
+  };
 
   return ctx.spawn_task(target_proc,
                         assert_unwrap(get_init_task_id_for_op_attrs(
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index c604d1b06a..c63d4727a9 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -26,7 +26,7 @@ Realm::Event register_task(Realm::Processor::Kind target_kind,
 Realm::Event register_all_tasks() {
   std::vector<Realm::Event> pending_registrations;
 
-  std::vector<task_id_t> task_ids = {
+  std::vector<task_id_t> init_task_ids = {
       // Init tasks
       task_id_t::BATCHNORM_INIT_TASK_ID,
       task_id_t::COMBINE_INIT_TASK_ID,
@@ -44,7 +44,14 @@ Realm::Event register_all_tasks() {
       task_id_t::REPARTITION_INIT_TASK_ID,
       task_id_t::REPLICATE_INIT_TASK_ID,
       task_id_t::SOFTMAX_INIT_TASK_ID,
+  };
 
+  for (task_id_t task_id : init_task_ids) {
+    pending_registrations.push_back(register_task(
+        Realm::Processor::TOC_PROC, task_id, device_init_task_body));
+  }
+
+  std::vector<task_id_t> task_ids = {
       // Forward tasks
       task_id_t::BATCHMATMUL_FWD_TASK_ID,
       task_id_t::BATCHNORM_FWD_TASK_ID,
@@ -118,6 +125,10 @@ Realm::Event register_all_tasks() {
   pending_registrations.push_back(register_task(Realm::Processor::LOC_PROC,
                                                 task_id_t::CONTROLLER_TASK_ID,
                                                 controller_task_body));
+  pending_registrations.push_back(
+      register_task(Realm::Processor::LOC_PROC,
+                    task_id_t::DEVICE_INIT_RETURN_TASK_ID,
+                    device_init_return_task_body));
   return Realm::Event::merge_events(pending_registrations);
 }
 

From b90b5808f5878fbea0b1e5681a9691573fb9d607 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 15:14:24 -0800
Subject: [PATCH 035/113] Finish implementation of device state initialization.

---
 .../tasks/impl/device_init_task.h             | 15 ++---
 ...distributed_device_state_initialization.cc | 57 ++++++++++++++++++-
 .../tasks/impl/device_init_task.cc            | 29 +++++-----
 3 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
index ebce5fed4c..af07139483 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
@@ -14,13 +14,14 @@ namespace FlexFlow {
 void device_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_device_init_task(RealmContext &ctx,
-                                    Realm::Processor &target_proc,
-                                    DynamicNodeInvocation const &invocation,
-                                    ProfilingSettings const &profiling_settings,
-                                    FFIterationConfig const &iteration_config,
-                                    OptimizerAttrs const &optimizer_attrs,
-                                    DeviceSpecificPerDeviceOpState *result_ptr);
+std::optional<Realm::Event>
+    spawn_device_init_task(RealmContext &ctx,
+                           Realm::Processor &target_proc,
+                           DynamicNodeInvocation const &invocation,
+                           ProfilingSettings const &profiling_settings,
+                           FFIterationConfig const &iteration_config,
+                           OptimizerAttrs const &optimizer_attrs,
+                           DeviceSpecificPerDeviceOpState *result_ptr);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index c6d0621f3d..f7fcea87e7 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -1,5 +1,11 @@
 #include "realm-execution/distributed_device_state_initialization.h"
-#include "utils/exception.h"
+#include "local-execution/device_state_initialization.h"
+#include "realm-execution/tasks/impl/device_init_task.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "utils/optional.h"
+#include <optional>
+#include <unordered_map>
 
 namespace FlexFlow {
 
@@ -9,7 +15,54 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     ProfilingSettings const &profiling_settings,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs) {
-  NOT_IMPLEMENTED();
+
+  // Initialize all operators and save the per-device op state
+  ASSERT(no_nodes_are_initialized(dg));
+
+  std::unordered_map<DynamicNodeInvocation, DeviceSpecificPerDeviceOpState *>
+      result_map;
+  for (DynamicNodeInvocation const &invocation : dg.invocations) {
+    Realm::Processor target_proc = ctx.map_device_coord_to_processor(
+        assert_unwrap(invocation.node_attrs.device_coord));
+
+    // FIXME: in the absence of a real serializer we're just tossing around raw
+    // bytes, which means we need to bypass the constructor for this type (yes,
+    // ugh)
+    DeviceSpecificPerDeviceOpState *output =
+        static_cast<DeviceSpecificPerDeviceOpState *>(
+            malloc(sizeof(DeviceSpecificPerDeviceOpState)));
+    std::optional<Realm::Event> result =
+        spawn_device_init_task(ctx,
+                               target_proc,
+                               invocation,
+                               profiling_settings,
+                               iteration_config,
+                               optimizer_attrs,
+                               output);
+    if (result) {
+      result_map[invocation] = output;
+    } else {
+      free(output);
+    }
+  }
+
+  ctx.get_outstanding_events().wait();
+
+  DynamicOpenDataflowGraph result = transform_dynamic_invocation_set(
+      dg, [&](DynamicNodeInvocation const &invocation) {
+        DynamicNodeInvocation result = invocation;
+        auto device_state = result_map.find(invocation);
+        if (device_state != result_map.end()) {
+          result.node_attrs.per_device_op_state = *device_state->second;
+        }
+        return result;
+      });
+
+  for (auto &[invocation, output] : result_map) {
+    free(output);
+  }
+
+  return result;
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
index c27fc5802b..91b753d639 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
@@ -1,6 +1,7 @@
 #include "realm-execution/tasks/impl/device_init_task.h"
 #include "local-execution/device_state_initialization.h"
 #include "realm-execution/tasks/impl/device_init_return_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "utils/optional.h"
 #include <optional>
@@ -56,15 +57,13 @@ void device_init_task_body(void const *args,
                       ctx.get_current_device_idx());
   std::optional<DeviceSpecificPerDeviceOpState> result_state =
       result_invocation.node_attrs.per_device_op_state;
-  if (result_state) {
-    spawn_device_init_return_task(ctx,
-                                  task_args.origin_proc,
-                                  assert_unwrap(result_state),
-                                  task_args.origin_result_ptr);
-  }
+  spawn_device_init_return_task(ctx,
+                                task_args.origin_proc,
+                                assert_unwrap(result_state),
+                                task_args.origin_result_ptr);
 }
 
-Realm::Event
+std::optional<Realm::Event>
     spawn_device_init_task(RealmContext &ctx,
                            Realm::Processor &target_proc,
                            DynamicNodeInvocation const &invocation,
@@ -81,12 +80,16 @@ Realm::Event
       result_ptr,
   };
 
-  return ctx.spawn_task(target_proc,
-                        assert_unwrap(get_init_task_id_for_op_attrs(
-                            assert_unwrap(invocation.node_attrs.op_attrs))),
-                        &task_args,
-                        sizeof(task_args),
-                        Realm::ProfilingRequestSet{});
+  std::optional<task_id_t> task_id = get_init_task_id_for_op_attrs(
+      assert_unwrap(invocation.node_attrs.op_attrs));
+  if (task_id) {
+    return ctx.spawn_task(target_proc,
+                          assert_unwrap(task_id),
+                          &task_args,
+                          sizeof(task_args),
+                          Realm::ProfilingRequestSet{});
+  }
+  return std::nullopt;
 }
 
 } // namespace FlexFlow

From c635bad2ce1495753d8a8257796ced3e340566d4 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 15:15:51 -0800
Subject: [PATCH 036/113] Block on initialization.

---
 .../parallel_computation_graph_instance.cc                    | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index bb763334d5..cdb3e5fe46 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -87,6 +87,10 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   dg = perform_shard_expansion(dg);
   TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
 
+  // FIXME: for now we're going to be lazy and block on everything rather than
+  // do fine-grained dependencies
+  ctx.get_outstanding_events().wait();
+
   std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
         return backing.backing.at(lgv).first;

From 17059262cc4ff91bff86888feb0278116149b3f1 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 17:04:10 -0800
Subject: [PATCH 037/113] Wire up rest of Realm implementation.

---
 .../parallel_computation_graph_instance.h     |  19 +--
 .../realm-execution/tasks/impl/op_task.h      |   8 +-
 .../parallel_computation_graph_instance.cc    | 159 +++++++++++++++---
 .../tasks/impl/device_init_task.cc            |  13 +-
 .../src/realm-execution/tasks/impl/op_task.cc |  49 +++++-
 5 files changed, 206 insertions(+), 42 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index f361cec3ca..0886dcf4c0 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -23,30 +23,27 @@ namespace FlexFlow {
 struct ParallelComputationGraphInstance {
 public:
   ParallelComputationGraphInstance(RealmContext &,
-                                   DynamicOpenDataflowGraph,
                                    std::vector<DynamicNodeInvocation> const &,
                                    OptimizerAttrs const &,
                                    std::optional<LossAttrs> const &,
-                                   std::optional<GenericTensorAccessorW>);
-  DynamicOpenDataflowGraph const &get_dynamic_dataflow_graph() const;
-  Allocator &get_allocator() const;
-  std::vector<DynamicNodeInvocation> const &get_topological_ordering() const;
+                                   std::optional<Realm::RegionInstance>);
+  RealmContext &get_realm_context();
+  std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   OptimizerAttrs const &get_optimizer_attrs() const;
   void update_optimizer_attrs_for_next_iter();
   std::optional<LossAttrs> const &get_loss_attrs() const;
-  std::optional<GenericTensorAccessorR> get_loss_tensor_accessor() const;
+  std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
 
 private:
-  RealmContext &realm;
-  DynamicOpenDataflowGraph dataflow_graph;
-  std::vector<DynamicNodeInvocation> topological_ordering;
+  RealmContext &ctx;
+  std::vector<DynamicNodeInvocation> execution_order;
   OptimizerAttrs optimizer_attrs;
   std::optional<LossAttrs> loss_attrs;
-  std::optional<GenericTensorAccessorW> logit_grad_tensor;
+  std::optional<Realm::RegionInstance> logit_grad_tensor;
 };
 
 ParallelComputationGraphInstance create_parallel_computation_graph_instance(
-    RealmContext &realm,
+    RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 4c3e6d38d1..dd75ed66ea 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -1,10 +1,13 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H
 
+#include "kernels/profiling_settings.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
 
 namespace FlexFlow {
 
@@ -12,8 +15,11 @@ void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 Realm::Event
     spawn_op_task(RealmContext &ctx,
-                  Realm::Processor &target_proc,
+                  Realm::Processor target_proc,
                   DynamicNodeInvocation const &invocation,
+                  ProfilingSettings const &profiling_settings,
+                  std::optional<LossAttrs> const &loss_attrs,
+                  FFIterationConfig const &iteration_config,
                   std::optional<OptimizerAttrs> const &optimizer_attrs);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index cdb3e5fe46..2683d019c3 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -2,6 +2,8 @@
 #include "pcg/optimizer_attrs.h"
 #include "realm-execution/distributed_device_state_initialization.h"
 #include "realm-execution/instance_allocation.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tasks/impl/op_task.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
@@ -9,33 +11,27 @@
 #include "task-spec/dynamic_graph/pass_expansion.h"
 #include "task-spec/dynamic_graph/shard_expansion.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
-#include "utils/exception.h"
+#include "utils/graph/digraph/algorithms/get_topological_ordering.h"
 #include "utils/optional.h"
 
 namespace FlexFlow {
 
 ParallelComputationGraphInstance::ParallelComputationGraphInstance(
-    RealmContext &realm,
-    DynamicOpenDataflowGraph dataflow_graph,
-    std::vector<DynamicNodeInvocation> const &topological_ordering,
+    RealmContext &ctx,
+    std::vector<DynamicNodeInvocation> const &execution_order,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<LossAttrs> const &loss_attrs,
-    std::optional<GenericTensorAccessorW> logit_grad_tensor)
-    : realm(realm), dataflow_graph(dataflow_graph),
-      topological_ordering(topological_ordering),
+    std::optional<Realm::RegionInstance> logit_grad_tensor)
+    : ctx(ctx), execution_order(execution_order),
       optimizer_attrs(optimizer_attrs), loss_attrs(loss_attrs),
       logit_grad_tensor(logit_grad_tensor) {}
 
-DynamicOpenDataflowGraph const &
-    ParallelComputationGraphInstance::get_dynamic_dataflow_graph() const {
-  return this->dataflow_graph;
-}
-Allocator &ParallelComputationGraphInstance::get_allocator() const {
-  return this->realm.get_current_device_allocator();
+RealmContext &ParallelComputationGraphInstance::get_realm_context() {
+  return this->ctx;
 }
 std::vector<DynamicNodeInvocation> const &
-    ParallelComputationGraphInstance::get_topological_ordering() const {
-  return this->topological_ordering;
+    ParallelComputationGraphInstance::get_execution_order() const {
+  return this->execution_order;
 }
 OptimizerAttrs const &
     ParallelComputationGraphInstance::get_optimizer_attrs() const {
@@ -49,8 +45,8 @@ std::optional<LossAttrs> const &
     ParallelComputationGraphInstance::get_loss_attrs() const {
   return this->loss_attrs;
 }
-std::optional<GenericTensorAccessorR>
-    ParallelComputationGraphInstance::get_loss_tensor_accessor() const {
+std::optional<Realm::RegionInstance>
+    ParallelComputationGraphInstance::get_loss_tensor_instance() const {
   return this->logit_grad_tensor;
 }
 
@@ -88,7 +84,7 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
 
   // FIXME: for now we're going to be lazy and block on everything rather than
-  // do fine-grained dependencies
+  // do fine-grained dependencies on instances
   ctx.get_outstanding_events().wait();
 
   std::optional<Realm::RegionInstance> logit_grad_tensor =
@@ -98,13 +94,134 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
 
   dg = perform_distributed_device_state_initialization(
       dg, ctx, profiling_settings, iteration_config, optimizer_attrs);
-  NOT_IMPLEMENTED();
+
+  // Compute the topological ordering of the graph
+  auto [kwarg_graph, node_map] =
+      labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph(dg);
+  std::vector<Node> node_topo_order = get_topological_ordering(kwarg_graph);
+  std::vector<DynamicNodeInvocation> invocation_topo_order = transform(
+      node_topo_order, [&](Node node) { return node_map.at_l(node); });
+
+  return ParallelComputationGraphInstance{ctx,
+                                          invocation_topo_order,
+                                          optimizer_attrs,
+                                          loss_attrs,
+                                          logit_grad_tensor};
 
   // TODO list:
-  //  * per-device state initialization (RPC mechanism?)
   //  * Realm allocator
-  //  * task body
   //  * external instances
 }
 
+static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    execute_distributed_dynamic_node_invocation_set(
+        RealmContext &ctx,
+        std::vector<DynamicNodeInvocation> const &invocations,
+        OptimizerAttrs const &optimizer_attrs,
+        ProfilingSettings const &profiling_settings,
+        std::optional<LossAttrs> const &loss_attrs,
+        FFIterationConfig iteration_config) {
+  return unordered_map_from_pairs(
+      transform(invocations, [&](DynamicNodeInvocation const &invocation) {
+        Realm::Event result =
+            spawn_op_task(ctx,
+                          ctx.map_device_coord_to_processor(assert_unwrap(
+                              invocation.node_attrs.device_coord)),
+                          invocation,
+                          profiling_settings,
+                          loss_attrs,
+                          iteration_config,
+                          optimizer_attrs);
+        return std::pair{invocation.node_attrs.layer_guid, result};
+      }));
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_all_passes_for_parallel_computation_graph_instance(
+        ParallelComputationGraphInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        FFIterationConfig iteration_config) {
+  std::vector<DynamicNodeInvocation> const &execution_order =
+      instance.get_execution_order();
+  std::unordered_map<dynamic_layer_guid_t, Realm::Event> result =
+      execute_distributed_dynamic_node_invocation_set(
+          /*ctx=*/instance.get_realm_context(),
+          /*invocations=*/execution_order,
+          /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+          /*profiling_settings=*/profiling_settings,
+          /*loss_attrs=*/instance.get_loss_attrs(),
+          /*iteration_config=*/iteration_config);
+  instance.update_optimizer_attrs_for_next_iter();
+  return result;
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_forward_pass_for_parallel_computation_graph_instance(
+        ParallelComputationGraphInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        FFIterationConfig iteration_config) {
+  std::vector<DynamicNodeInvocation> const &execution_order =
+      filter(instance.get_execution_order(),
+             [](DynamicNodeInvocation const &invocation) {
+               DynamicTaskType task_type =
+                   assert_unwrap(invocation.node_attrs.task_type);
+               return task_type == DynamicTaskType::FWD;
+             });
+
+  return execute_distributed_dynamic_node_invocation_set(
+      /*ctx=*/instance.get_realm_context(),
+      /*invocations=*/execution_order,
+      /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+      /*profiling_settings=*/profiling_settings,
+      /*loss_attrs=*/instance.get_loss_attrs(),
+      /*iteration_config=*/iteration_config);
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_backward_pass_for_parallel_computation_graph_instance(
+        ParallelComputationGraphInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        FFIterationConfig iteration_config) {
+  std::vector<DynamicNodeInvocation> const &execution_order =
+      filter(instance.get_execution_order(),
+             [](DynamicNodeInvocation const &invocation) {
+               DynamicTaskType task_type =
+                   assert_unwrap(invocation.node_attrs.task_type);
+               return task_type == DynamicTaskType::BWD;
+             });
+
+  return execute_distributed_dynamic_node_invocation_set(
+      /*ctx=*/instance.get_realm_context(),
+      /*invocations=*/execution_order,
+      /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+      /*profiling_settings=*/profiling_settings,
+      /*loss_attrs=*/instance.get_loss_attrs(),
+      /*iteration_config=*/iteration_config);
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_update_pass_for_parallel_computation_graph_instance(
+        ParallelComputationGraphInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        FFIterationConfig iteration_config) {
+  std::vector<DynamicNodeInvocation> const &execution_order =
+      filter(instance.get_execution_order(),
+             [](DynamicNodeInvocation const &invocation) {
+               DynamicTaskType task_type =
+                   assert_unwrap(invocation.node_attrs.task_type);
+               return task_type == DynamicTaskType::UPD;
+             });
+
+  std::unordered_map<dynamic_layer_guid_t, Realm::Event> result =
+      execute_distributed_dynamic_node_invocation_set(
+          /*ctx=*/instance.get_realm_context(),
+          /*invocations=*/execution_order,
+          /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+          /*profiling_settings=*/profiling_settings,
+          /*loss_attrs=*/instance.get_loss_attrs(),
+          /*iteration_config=*/iteration_config);
+  instance.update_optimizer_attrs_for_next_iter();
+  return result;
+}
+
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
index 91b753d639..49b5568d26 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
@@ -3,6 +3,7 @@
 #include "realm-execution/tasks/impl/device_init_return_task.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
+#include "task-spec/device_specific_per_device_op_state.dtg.h"
 #include "utils/optional.h"
 #include <optional>
 #include <type_traits>
@@ -43,7 +44,7 @@ void device_init_task_body(void const *args,
   DeviceInitTaskArgs task_args =
       *reinterpret_cast<DeviceInitTaskArgs const *>(args);
 
-  // FIXME: not safe to dereference unless we're on the same address space
+  // FIXME: serialize instead of passing pointers around
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
 
   RealmContext ctx{proc};
@@ -55,11 +56,15 @@ void device_init_task_body(void const *args,
                       *task_args.iteration_config,
                       *task_args.optimizer_attrs,
                       ctx.get_current_device_idx());
-  std::optional<DeviceSpecificPerDeviceOpState> result_state =
-      result_invocation.node_attrs.per_device_op_state;
+  DeviceSpecificPerDeviceOpState result_state =
+      assert_unwrap(result_invocation.node_attrs.per_device_op_state);
+  // Important: to make sure this doesn't get deallocated, we intentionally leak
+  // the allocation here
+  DeviceSpecificPerDeviceOpState *result_state_ptr =
+      new DeviceSpecificPerDeviceOpState{result_state};
   spawn_device_init_return_task(ctx,
                                 task_args.origin_proc,
-                                assert_unwrap(result_state),
+                                *result_state_ptr,
                                 task_args.origin_result_ptr);
 }
 
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index 9d9a36e2d5..79c152844b 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -1,5 +1,7 @@
 #include "realm-execution/tasks/impl/op_task.h"
+#include "local-execution/task_execution.h"
 #include "realm-execution/tasks/task_id_t.h"
+#include "task-spec/per_device_op_state.h"
 #include "utils/optional.h"
 #include <type_traits>
 
@@ -9,8 +11,24 @@ namespace FlexFlow {
 // now just pass the pointer and assume we're running inside a single address
 // space
 struct OpTaskArgs {
+public:
+  OpTaskArgs() = delete;
+  OpTaskArgs(DynamicNodeInvocation const *invocation,
+             ProfilingSettings const *profiling_settings,
+             std::optional<LossAttrs> const *loss_attrs,
+             FFIterationConfig const *iteration_config,
+             std::optional<OptimizerAttrs> const *optimizer_attrs,
+             Realm::Processor origin_proc)
+      : invocation(invocation), profiling_settings(profiling_settings),
+        loss_attrs(loss_attrs), iteration_config(iteration_config),
+        optimizer_attrs(optimizer_attrs) {}
+
 public:
   DynamicNodeInvocation const *invocation;
+  ProfilingSettings const *profiling_settings;
+  std::optional<LossAttrs> const *loss_attrs;
+  FFIterationConfig const *iteration_config;
+  std::optional<OptimizerAttrs> const *optimizer_attrs;
   Realm::Processor origin_proc;
 };
 static_assert(std::has_unique_object_representations_v<OpTaskArgs>);
@@ -23,20 +41,41 @@ void op_task_body(void const *args,
   ASSERT(arglen == sizeof(OpTaskArgs));
   OpTaskArgs task_args = *reinterpret_cast<OpTaskArgs const *>(args);
 
-  // FIXME: not safe to dereference unless we're on the same address space
+  // FIXME: serialize instead of passing pointers around
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
 
   RealmContext ctx{proc};
-  NOT_IMPLEMENTED();
+  execute_dynamic_node_invocation(
+      /*invocation=*/*task_args.invocation,
+      /*allocator=*/ctx.get_current_device_allocator(),
+      /*profiling_settings=*/*task_args.profiling_settings,
+      /*ff_handle=*/ctx.get_current_device_handle(),
+      /*loss_attrs=*/*task_args.loss_attrs,
+      /*per_device_op_state=*/
+      transform(task_args.invocation->node_attrs.per_device_op_state,
+                [&](DeviceSpecificPerDeviceOpState const &op_state) {
+                  return get_device_state_from_device_specific(
+                      op_state, ctx.get_current_device_idx());
+                }),
+      /*iteration_config=*/*task_args.iteration_config,
+      /*optimizer_attrs=*/*task_args.optimizer_attrs,
+      /*device_idx=*/ctx.get_current_device_idx());
 }
 
 Realm::Event
     spawn_op_task(RealmContext &ctx,
-                  Realm::Processor &target_proc,
+                  Realm::Processor target_proc,
                   DynamicNodeInvocation const &invocation,
+                  ProfilingSettings const &profiling_settings,
+                  std::optional<LossAttrs> const &loss_attrs,
+                  FFIterationConfig const &iteration_config,
                   std::optional<OptimizerAttrs> const &optimizer_attrs) {
-  OpTaskArgs task_args;
-  task_args.invocation = &invocation;
+  OpTaskArgs task_args{&invocation,
+                       &profiling_settings,
+                       &loss_attrs,
+                       &iteration_config,
+                       &optimizer_attrs,
+                       ctx.get_current_processor()};
   return ctx.spawn_task(
       target_proc,
       assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)),

From ae265688351883c1c0d701b9528b553fd3208d7e Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 10 Feb 2026 17:16:31 -0800
Subject: [PATCH 038/113] Implement Realm device idx.

---
 .../include/realm-execution/realm_context.h   |  2 +-
 .../src/realm-execution/realm_context.cc      | 26 +++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 422c4f4027..e28e91234e 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -29,7 +29,7 @@ struct RealmContext {
   Realm::Processor get_current_processor() const;
   Allocator &get_current_device_allocator() const;
   device_handle_t const &get_current_device_handle() const;
-  device_id_t const &get_current_device_idx() const;
+  device_id_t get_current_device_idx() const;
 
   // Task creation
   Realm::Event spawn_task(Realm::Processor proc,
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 7e6c73c9e7..781561c95a 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -1,6 +1,7 @@
 #include "realm-execution/realm_context.h"
 #include "op-attrs/datatype.h"
 #include "op-attrs/tensor_dims.dtg.h"
+#include "pcg/device_id_t.h"
 #include "pcg/device_type.dtg.h"
 #include "realm-execution/tasks/realm_task_id_t.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
@@ -70,8 +71,36 @@ Allocator &RealmContext::get_current_device_allocator() const {
 device_handle_t const &RealmContext::get_current_device_handle() const {
   NOT_IMPLEMENTED();
 }
-device_id_t const &RealmContext::get_current_device_idx() const {
-  NOT_IMPLEMENTED();
+// Returns the device id of the processor this context runs on: its position
+// among the processors of the same kind in its own address space, tagged as
+// CPU for LOC_PROC and GPU for TOC_PROC. Panics on any other processor kind.
+device_id_t RealmContext::get_current_device_idx() const {
+  Realm::Processor proc = this->get_current_processor();
+
+  // FIXME: find a more efficient way to implement this than scanning the
+  // machine every time
+  Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+  pq.same_address_space_as(proc);
+  // Restrict the scan to processors of the same kind so the index is
+  // per-device-type and is not shifted by processors of other kinds that the
+  // query enumerates ahead of this one
+  pq.only_kind(proc.kind());
+  nonnegative_int idx{0};
+  for (Realm::Processor p : pq) {
+    if (p == proc) {
+      break;
+    }
+    idx++;
+  }
+
+  switch (proc.kind()) {
+    case Realm::Processor::LOC_PROC:
+      return make_device_id_t_from_idx(idx, DeviceType::CPU);
+    case Realm::Processor::TOC_PROC:
+      return make_device_id_t_from_idx(idx, DeviceType::GPU);
+    default:
+      PANIC("Unhandled Realm::ProcessorKind", fmt::to_string(int{proc.kind()}));
+  }
 }
 
 Realm::Event

From 9fc431230b45322bb80e860b23d26665e2c81ee2 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 09:47:00 -0800
Subject: [PATCH 039/113] Updates to compile against latest local-execution.

---
 .../parallel_computation_graph_instance.h     |  3 --
 .../realm-execution/tasks/impl/op_task.h      |  1 -
 .../parallel_computation_graph_instance.cc    | 32 ++++++-------------
 .../tasks/impl/device_init_task.cc            | 11 +++++--
 .../src/realm-execution/tasks/impl/op_task.cc |  8 +----
 .../src/realm-execution/tasks/task_id_t.cc    | 12 ++++---
 ...e_dynamic_open_dataflow_graph_from_mpcg.cc |  2 +-
 7 files changed, 28 insertions(+), 41 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index 0886dcf4c0..de06f457e2 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -25,20 +25,17 @@ struct ParallelComputationGraphInstance {
   ParallelComputationGraphInstance(RealmContext &,
                                    std::vector<DynamicNodeInvocation> const &,
                                    OptimizerAttrs const &,
-                                   std::optional<LossAttrs> const &,
                                    std::optional<Realm::RegionInstance>);
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   OptimizerAttrs const &get_optimizer_attrs() const;
   void update_optimizer_attrs_for_next_iter();
-  std::optional<LossAttrs> const &get_loss_attrs() const;
   std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
 
 private:
   RealmContext &ctx;
   std::vector<DynamicNodeInvocation> execution_order;
   OptimizerAttrs optimizer_attrs;
-  std::optional<LossAttrs> loss_attrs;
   std::optional<Realm::RegionInstance> logit_grad_tensor;
 };
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index dd75ed66ea..3fcffc30fa 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -18,7 +18,6 @@ Realm::Event
                   Realm::Processor target_proc,
                   DynamicNodeInvocation const &invocation,
                   ProfilingSettings const &profiling_settings,
-                  std::optional<LossAttrs> const &loss_attrs,
                   FFIterationConfig const &iteration_config,
                   std::optional<OptimizerAttrs> const &optimizer_attrs);
 
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
index 2683d019c3..05dfec74c3 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
@@ -20,11 +20,9 @@ ParallelComputationGraphInstance::ParallelComputationGraphInstance(
     RealmContext &ctx,
     std::vector<DynamicNodeInvocation> const &execution_order,
     OptimizerAttrs const &optimizer_attrs,
-    std::optional<LossAttrs> const &loss_attrs,
     std::optional<Realm::RegionInstance> logit_grad_tensor)
     : ctx(ctx), execution_order(execution_order),
-      optimizer_attrs(optimizer_attrs), loss_attrs(loss_attrs),
-      logit_grad_tensor(logit_grad_tensor) {}
+      optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {}
 
 RealmContext &ParallelComputationGraphInstance::get_realm_context() {
   return this->ctx;
@@ -41,10 +39,6 @@ void ParallelComputationGraphInstance::update_optimizer_attrs_for_next_iter() {
   this->optimizer_attrs =
       get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
 }
-std::optional<LossAttrs> const &
-    ParallelComputationGraphInstance::get_loss_attrs() const {
-  return this->loss_attrs;
-}
 std::optional<Realm::RegionInstance>
     ParallelComputationGraphInstance::get_loss_tensor_instance() const {
   return this->logit_grad_tensor;
@@ -102,15 +96,15 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   std::vector<DynamicNodeInvocation> invocation_topo_order = transform(
       node_topo_order, [&](Node node) { return node_map.at_l(node); });
 
-  return ParallelComputationGraphInstance{ctx,
-                                          invocation_topo_order,
-                                          optimizer_attrs,
-                                          loss_attrs,
-                                          logit_grad_tensor};
+  return ParallelComputationGraphInstance{
+      ctx, invocation_topo_order, optimizer_attrs, logit_grad_tensor};
 
   // TODO list:
   //  * Realm allocator
   //  * external instances
+  //  * dependencies
+  //  * task argument serializer
+  //  * copies
 }
 
 static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
@@ -119,7 +113,6 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         std::vector<DynamicNodeInvocation> const &invocations,
         OptimizerAttrs const &optimizer_attrs,
         ProfilingSettings const &profiling_settings,
-        std::optional<LossAttrs> const &loss_attrs,
         FFIterationConfig iteration_config) {
   return unordered_map_from_pairs(
       transform(invocations, [&](DynamicNodeInvocation const &invocation) {
@@ -129,7 +122,6 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
                               invocation.node_attrs.device_coord)),
                           invocation,
                           profiling_settings,
-                          loss_attrs,
                           iteration_config,
                           optimizer_attrs);
         return std::pair{invocation.node_attrs.layer_guid, result};
@@ -141,7 +133,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         ParallelComputationGraphInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
-  std::vector<DynamicNodeInvocation> const &execution_order =
+  std::vector<DynamicNodeInvocation> execution_order =
       instance.get_execution_order();
   std::unordered_map<dynamic_layer_guid_t, Realm::Event> result =
       execute_distributed_dynamic_node_invocation_set(
@@ -149,7 +141,6 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
           /*invocations=*/execution_order,
           /*optimizer_attrs=*/instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
-          /*loss_attrs=*/instance.get_loss_attrs(),
           /*iteration_config=*/iteration_config);
   instance.update_optimizer_attrs_for_next_iter();
   return result;
@@ -160,7 +151,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         ParallelComputationGraphInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
-  std::vector<DynamicNodeInvocation> const &execution_order =
+  std::vector<DynamicNodeInvocation> execution_order =
       filter(instance.get_execution_order(),
              [](DynamicNodeInvocation const &invocation) {
                DynamicTaskType task_type =
@@ -173,7 +164,6 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
       /*invocations=*/execution_order,
       /*optimizer_attrs=*/instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
-      /*loss_attrs=*/instance.get_loss_attrs(),
       /*iteration_config=*/iteration_config);
 }
 
@@ -182,7 +172,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         ParallelComputationGraphInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
-  std::vector<DynamicNodeInvocation> const &execution_order =
+  std::vector<DynamicNodeInvocation> execution_order =
       filter(instance.get_execution_order(),
              [](DynamicNodeInvocation const &invocation) {
                DynamicTaskType task_type =
@@ -195,7 +185,6 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
       /*invocations=*/execution_order,
       /*optimizer_attrs=*/instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
-      /*loss_attrs=*/instance.get_loss_attrs(),
       /*iteration_config=*/iteration_config);
 }
 
@@ -204,7 +193,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         ParallelComputationGraphInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
-  std::vector<DynamicNodeInvocation> const &execution_order =
+  std::vector<DynamicNodeInvocation> execution_order =
       filter(instance.get_execution_order(),
              [](DynamicNodeInvocation const &invocation) {
                DynamicTaskType task_type =
@@ -218,7 +207,6 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
           /*invocations=*/execution_order,
           /*optimizer_attrs=*/instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
-          /*loss_attrs=*/instance.get_loss_attrs(),
           /*iteration_config=*/iteration_config);
   instance.update_optimizer_attrs_for_next_iter();
   return result;
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
index 49b5568d26..cc080255e2 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
@@ -4,6 +4,7 @@
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
+#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
 #include "utils/optional.h"
 #include <optional>
 #include <type_traits>
@@ -85,9 +86,13 @@ std::optional<Realm::Event>
       result_ptr,
   };
 
-  std::optional<task_id_t> task_id = get_init_task_id_for_op_attrs(
-      assert_unwrap(invocation.node_attrs.op_attrs));
-  if (task_id) {
+  std::optional<task_id_t> task_id =
+      and_then(and_then(invocation.node_attrs.op_attrs,
+                        [](TrainingOperationAttrs const &op_attrs) {
+                          return op_attrs.try_require_pcg_op();
+                        }),
+               get_init_task_id_for_op_attrs);
+  if (task_id.has_value()) {
     return ctx.spawn_task(target_proc,
                           assert_unwrap(task_id),
                           &task_args,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index 79c152844b..5f6ab40607 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -15,18 +15,15 @@ struct OpTaskArgs {
   OpTaskArgs() = delete;
   OpTaskArgs(DynamicNodeInvocation const *invocation,
              ProfilingSettings const *profiling_settings,
-             std::optional<LossAttrs> const *loss_attrs,
              FFIterationConfig const *iteration_config,
              std::optional<OptimizerAttrs> const *optimizer_attrs,
              Realm::Processor origin_proc)
       : invocation(invocation), profiling_settings(profiling_settings),
-        loss_attrs(loss_attrs), iteration_config(iteration_config),
-        optimizer_attrs(optimizer_attrs) {}
+        iteration_config(iteration_config), optimizer_attrs(optimizer_attrs) {}
 
 public:
   DynamicNodeInvocation const *invocation;
   ProfilingSettings const *profiling_settings;
-  std::optional<LossAttrs> const *loss_attrs;
   FFIterationConfig const *iteration_config;
   std::optional<OptimizerAttrs> const *optimizer_attrs;
   Realm::Processor origin_proc;
@@ -50,7 +47,6 @@ void op_task_body(void const *args,
       /*allocator=*/ctx.get_current_device_allocator(),
       /*profiling_settings=*/*task_args.profiling_settings,
       /*ff_handle=*/ctx.get_current_device_handle(),
-      /*loss_attrs=*/*task_args.loss_attrs,
       /*per_device_op_state=*/
       transform(task_args.invocation->node_attrs.per_device_op_state,
                 [&](DeviceSpecificPerDeviceOpState const &op_state) {
@@ -67,12 +63,10 @@ Realm::Event
                   Realm::Processor target_proc,
                   DynamicNodeInvocation const &invocation,
                   ProfilingSettings const &profiling_settings,
-                  std::optional<LossAttrs> const &loss_attrs,
                   FFIterationConfig const &iteration_config,
                   std::optional<OptimizerAttrs> const &optimizer_attrs) {
   OpTaskArgs task_args{&invocation,
                        &profiling_settings,
-                       &loss_attrs,
                        &iteration_config,
                        &optimizer_attrs,
                        ctx.get_current_processor()};
diff --git a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
index 5a99f2bea8..94e1b887e7 100644
--- a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
@@ -2,6 +2,7 @@
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "utils/optional.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
@@ -9,14 +10,17 @@ namespace FlexFlow {
 std::optional<task_id_t>
     get_task_id_for_op(DynamicNodeAttrs const &node_attrs,
                        std::optional<OptimizerAttrs> const &optimizer_attrs) {
-  DynamicTaskType task_type = node_attrs.task_type.value();
+  DynamicTaskType task_type = assert_unwrap(node_attrs.task_type);
   switch (task_type) {
     case DynamicTaskType::FWD:
-      return get_fwd_task_id_for_op_attrs(node_attrs.op_attrs.value());
+      return get_fwd_task_id_for_op_attrs(
+          assert_unwrap(node_attrs.op_attrs).require_pcg_op());
     case DynamicTaskType::BWD:
-      return get_bwd_task_id_for_op_attrs(node_attrs.op_attrs.value());
+      return get_bwd_task_id_for_op_attrs(
+          assert_unwrap(node_attrs.op_attrs).require_pcg_op());
     case DynamicTaskType::UPD:
-      return get_update_task_id_for_optimizer_attrs(optimizer_attrs.value());
+      return get_update_task_id_for_optimizer_attrs(
+          assert_unwrap(optimizer_attrs));
     case DynamicTaskType::LOSS:
       return task_id_t::LOSS_BWD_TASK_ID;
     default:
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
index e90ef10398..ced98dfd44 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
@@ -23,7 +23,7 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
         /*task_type=*/std::nullopt,
         /*device_coord=*/std::nullopt,
         /*mapping=*/mpcg.mapped_tasks.at(layer),
-        /*op_attrs=*/attrs.op_attrs,
+        /*op_attrs=*/TrainingOperationAttrs{attrs.op_attrs},
         /*pcg_layer_guid=*/dynamic_layer_guid_t{layer},
         /*per_device_op_state=*/std::nullopt,
     };

From 4e706ffaded6485dbbbf50430bf7bd653797b68e Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 10:01:14 -0800
Subject: [PATCH 040/113] Fix up function arguments.

---
 .../distributed_device_state_initialization.h         |  2 +-
 .../include/realm-execution/instance_allocation.h     | 11 ++++++-----
 .../parallel_computation_graph_instance.h             |  9 +++++----
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
index 4121f10341..d2ed093c0b 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -10,7 +10,7 @@
 namespace FlexFlow {
 
 DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
-    DynamicOpenDataflowGraph const &,
+    DynamicOpenDataflowGraph const &dg,
     RealmContext &ctx,
     ProfilingSettings const &profiling_settings,
     FFIterationConfig const &iteration_config,
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index 59065694e9..09709201ce 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -7,15 +7,16 @@
 
 namespace FlexFlow {
 
-DynamicValueAttrs
-    perform_instance_allocation_for_value(DynamicValueAttrs const &,
-                                          Allocator &);
+std::pair<Realm::RegionInstance, Realm::Event>
+    perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
+                                          DynamicValueAttrs const &value,
+                                          RealmContext &ctx);
 
 TensorInstanceBacking perform_instance_allocation(
-    DynamicOpenDataflowGraph const &,
+    DynamicOpenDataflowGraph const &g,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &preallocated,
-    RealmContext &);
+    RealmContext &ctx);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
index de06f457e2..f48879a2bb 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
@@ -22,10 +22,11 @@ namespace FlexFlow {
 
 struct ParallelComputationGraphInstance {
 public:
-  ParallelComputationGraphInstance(RealmContext &,
-                                   std::vector<DynamicNodeInvocation> const &,
-                                   OptimizerAttrs const &,
-                                   std::optional<Realm::RegionInstance>);
+  ParallelComputationGraphInstance(
+      RealmContext &ctx,
+      std::vector<DynamicNodeInvocation> const &execution_order,
+      OptimizerAttrs const &optimizer_attrs,
+      std::optional<Realm::RegionInstance> logit_grad_tensor);
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   OptimizerAttrs const &get_optimizer_attrs() const;

From efde0eb379501ed83d005aed686314331fa90d42 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 10:50:16 -0800
Subject: [PATCH 041/113] Rename PCGInstance and add dependency set.

---
 .../realm-execution/atomic_dependency_set.h   | 26 ++++++++++++
 .../include/realm-execution/dependency_set.h  | 34 +++++++++++++++
 .../pcg_instance.h}                           | 13 +++---
 .../realm-execution/atomic_dependency_set.cc  | 23 +++++++++++
 .../src/realm-execution/dependency_set.cc     | 41 +++++++++++++++++++
 .../pcg_instance.cc}                          | 31 +++++++-------
 .../test/src/realm-execution/realm_manager.cc |  1 -
 .../test/src/realm-execution/test_e2e.cc      |  2 +-
 8 files changed, 150 insertions(+), 21 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/atomic_dependency_set.h
 create mode 100644 lib/realm-execution/include/realm-execution/dependency_set.h
 rename lib/realm-execution/include/realm-execution/{parallel_computation_graph_instance/parallel_computation_graph_instance.h => pcg_instance/pcg_instance.h} (84%)
 create mode 100644 lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
 create mode 100644 lib/realm-execution/src/realm-execution/dependency_set.cc
 rename lib/realm-execution/src/realm-execution/{parallel_computation_graph_instance/parallel_computation_graph_instance.cc => pcg_instance/pcg_instance.cc} (90%)

diff --git a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
new file mode 100644
index 0000000000..8a1ae96b3e
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
@@ -0,0 +1,26 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_ATOMIC_DEPENDENCY_SET_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_ATOMIC_DEPENDENCY_SET_H
+
+#include "realm-execution/realm.h"
+#include <vector>
+
+namespace FlexFlow {
+
+struct AtomicDependencySet {
+public:
+  AtomicDependencySet() = delete;
+  explicit AtomicDependencySet(Realm::Event precondition);
+
+  void add_writer(Realm::Event writer);
+  void add_reader(Realm::Event reader);
+
+  Realm::Event get_current_outstanding_events() const;
+
+private:
+  Realm::Event writer;
+  std::vector<Realm::Event> readers;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/dependency_set.h b/lib/realm-execution/include/realm-execution/dependency_set.h
new file mode 100644
index 0000000000..a7100076b2
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/dependency_set.h
@@ -0,0 +1,34 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEPENDENCY_SET_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEPENDENCY_SET_H
+
+#include "realm-execution/atomic_dependency_set.h"
+#include "realm-execution/realm.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include <unordered_map>
+
+namespace FlexFlow {
+
+struct DependencySet {
+public:
+  DependencySet() = delete;
+  explicit DependencySet(Realm::Event precondition);
+
+  void add_writer(DynamicValueAttrs const &value, Realm::Event writer);
+  void add_reader(DynamicValueAttrs const &value, Realm::Event reader);
+
+  Realm::Event
+      get_current_outstanding_events(DynamicValueAttrs const &value) const;
+
+private:
+  AtomicDependencySet &
+      get_atomic_dependency_set(DynamicValueAttrs const &value);
+
+private:
+  Realm::Event precondition;
+  std::unordered_map<DynamicValueAttrs, AtomicDependencySet>
+      atomic_dependencies;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
similarity index 84%
rename from lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
rename to lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index f48879a2bb..3c5b4189ea 100644
--- a/lib/realm-execution/include/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PARALLEL_COMPUTATION_GRAPH_INSTANCE_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PARALLEL_COMPUTATION_GRAPH_INSTANCE_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
 
 #include "kernels/accessor.h"
 #include "kernels/allocation.h"
@@ -20,9 +20,12 @@
 
 namespace FlexFlow {
 
-struct ParallelComputationGraphInstance {
+struct PCGInstance {
 public:
-  ParallelComputationGraphInstance(
+  PCGInstance() = delete;
+  PCGInstance(PCGInstance const &) = delete;
+  PCGInstance(PCGInstance &&) = delete;
+  explicit PCGInstance(
       RealmContext &ctx,
       std::vector<DynamicNodeInvocation> const &execution_order,
       OptimizerAttrs const &optimizer_attrs,
@@ -40,7 +43,7 @@ struct ParallelComputationGraphInstance {
   std::optional<Realm::RegionInstance> logit_grad_tensor;
 };
 
-ParallelComputationGraphInstance create_parallel_computation_graph_instance(
+PCGInstance create_pcg_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
diff --git a/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
new file mode 100644
index 0000000000..bdc05b7c46
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
@@ -0,0 +1,23 @@
+#include "realm-execution/atomic_dependency_set.h"
+
+namespace FlexFlow {
+
+AtomicDependencySet::AtomicDependencySet(Realm::Event precondition)
+    : writer(precondition) {}
+
+void AtomicDependencySet::add_writer(Realm::Event writer) {
+  this->writer = Realm::Event::merge_events(
+      writer, this->get_current_outstanding_events());
+  this->readers.clear();
+}
+
+void AtomicDependencySet::add_reader(Realm::Event reader) {
+  this->readers.push_back(reader);
+}
+
+Realm::Event AtomicDependencySet::get_current_outstanding_events() const {
+  Realm::Event readers = Realm::Event::merge_events(this->readers);
+  return Realm::Event::merge_events(writer, readers);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/dependency_set.cc b/lib/realm-execution/src/realm-execution/dependency_set.cc
new file mode 100644
index 0000000000..3af03ffcef
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/dependency_set.cc
@@ -0,0 +1,41 @@
+#include "realm-execution/dependency_set.h"
+#include "realm-execution/atomic_dependency_set.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+DependencySet::DependencySet(Realm::Event precondition)
+    : precondition(precondition) {}
+
+void DependencySet::add_writer(DynamicValueAttrs const &value,
+                               Realm::Event writer) {
+  AtomicDependencySet &atomic_dependence_set =
+      this->get_atomic_dependency_set(value);
+  atomic_dependence_set.add_writer(writer);
+}
+
+void DependencySet::add_reader(DynamicValueAttrs const &value,
+                               Realm::Event reader) {
+  AtomicDependencySet &atomic_dependence_set =
+      this->get_atomic_dependency_set(value);
+  atomic_dependence_set.add_reader(reader);
+}
+
+Realm::Event DependencySet::get_current_outstanding_events(
+    DynamicValueAttrs const &value) const {
+  if (contains_key(this->atomic_dependencies, value)) {
+    return this->atomic_dependencies.at(value).get_current_outstanding_events();
+  }
+  return this->precondition;
+}
+
+AtomicDependencySet &
+    DependencySet::get_atomic_dependency_set(DynamicValueAttrs const &value) {
+  // Lazily create the per-value set, seeded with the shared precondition.
+  // try_emplace does a single hash lookup and leaves an existing entry
+  // untouched, so readers/writers already recorded for `value` are kept.
+  return this->atomic_dependencies.try_emplace(value, this->precondition)
+      .first->second;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
similarity index 90%
rename from lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
rename to lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 05dfec74c3..c1654397ec 100644
--- a/lib/realm-execution/src/realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -1,5 +1,6 @@
-#include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include "realm-execution/pcg_instance/pcg_instance.h"
 #include "pcg/optimizer_attrs.h"
+#include "realm-execution/dependency_set.h"
 #include "realm-execution/distributed_device_state_initialization.h"
 #include "realm-execution/instance_allocation.h"
 #include "realm-execution/realm_context.h"
@@ -16,7 +17,7 @@
 
 namespace FlexFlow {
 
-ParallelComputationGraphInstance::ParallelComputationGraphInstance(
+PCGInstance::PCGInstance(
     RealmContext &ctx,
     std::vector<DynamicNodeInvocation> const &execution_order,
     OptimizerAttrs const &optimizer_attrs,
@@ -24,27 +25,26 @@ ParallelComputationGraphInstance::ParallelComputationGraphInstance(
     : ctx(ctx), execution_order(execution_order),
       optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {}
 
-RealmContext &ParallelComputationGraphInstance::get_realm_context() {
+RealmContext &PCGInstance::get_realm_context() {
   return this->ctx;
 }
 std::vector<DynamicNodeInvocation> const &
-    ParallelComputationGraphInstance::get_execution_order() const {
+    PCGInstance::get_execution_order() const {
   return this->execution_order;
 }
-OptimizerAttrs const &
-    ParallelComputationGraphInstance::get_optimizer_attrs() const {
+OptimizerAttrs const &PCGInstance::get_optimizer_attrs() const {
   return this->optimizer_attrs;
 }
-void ParallelComputationGraphInstance::update_optimizer_attrs_for_next_iter() {
+void PCGInstance::update_optimizer_attrs_for_next_iter() {
   this->optimizer_attrs =
       get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
 }
 std::optional<Realm::RegionInstance>
-    ParallelComputationGraphInstance::get_loss_tensor_instance() const {
+    PCGInstance::get_loss_tensor_instance() const {
   return this->logit_grad_tensor;
 }
 
-ParallelComputationGraphInstance create_parallel_computation_graph_instance(
+PCGInstance create_parallel_computation_graph_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
@@ -96,7 +96,7 @@ ParallelComputationGraphInstance create_parallel_computation_graph_instance(
   std::vector<DynamicNodeInvocation> invocation_topo_order = transform(
       node_topo_order, [&](Node node) { return node_map.at_l(node); });
 
-  return ParallelComputationGraphInstance{
+  return PCGInstance{
       ctx, invocation_topo_order, optimizer_attrs, logit_grad_tensor};
 
   // TODO list:
@@ -114,6 +114,9 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         OptimizerAttrs const &optimizer_attrs,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
+  // For simplicity we'll track a dependency on all outstanding operations up to
+  // this point. This will create an effective barrier between phases.
+  DependencySet dependency_set{ctx.get_outstanding_events()};
   return unordered_map_from_pairs(
       transform(invocations, [&](DynamicNodeInvocation const &invocation) {
         Realm::Event result =
@@ -130,7 +133,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_parallel_computation_graph_instance(
-        ParallelComputationGraphInstance &instance,
+        PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
@@ -148,7 +151,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_forward_pass_for_parallel_computation_graph_instance(
-        ParallelComputationGraphInstance &instance,
+        PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
@@ -169,7 +172,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_backward_pass_for_parallel_computation_graph_instance(
-        ParallelComputationGraphInstance &instance,
+        PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
@@ -190,7 +193,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_update_pass_for_parallel_computation_graph_instance(
-        ParallelComputationGraphInstance &instance,
+        PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 6c28a001ad..94e0d7d0f4 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -1,5 +1,4 @@
 #include "realm-execution/realm_manager.h"
-#include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index a30d5c4d8e..37f1a9b42c 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/parallel_computation_graph_instance/parallel_computation_graph_instance.h"
+#include "realm-execution/pcg_instance/pcg_instance.h"
 #include "realm-execution/realm_manager.h"
 #include <doctest/doctest.h>
 

From 40943c146941f52ef981f4e8d3a03b80791f6920 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 11:17:21 -0800
Subject: [PATCH 042/113] Dependency tracking.

---
 .../realm-execution/atomic_dependency_set.h   |  3 +-
 .../include/realm-execution/dependency_set.h  |  4 +-
 .../distributed_device_state_initialization.h |  3 +-
 .../tasks/impl/controller_task.h              |  3 +-
 .../tasks/impl/device_init_return_task.h      |  3 +-
 .../tasks/impl/device_init_task.h             |  3 +-
 .../realm-execution/tasks/impl/op_task.h      | 14 +++----
 .../realm-execution/atomic_dependency_set.cc  | 12 ++++--
 .../src/realm-execution/dependency_set.cc     | 12 +++++-
 ...distributed_device_state_initialization.cc |  6 ++-
 .../pcg_instance/pcg_instance.cc              | 39 +++++++++++++++----
 .../src/realm-execution/realm_manager.cc      |  3 +-
 .../tasks/impl/controller_task.cc             | 12 +++---
 .../tasks/impl/device_init_return_task.cc     |  6 ++-
 .../tasks/impl/device_init_task.cc            |  9 +++--
 .../src/realm-execution/tasks/impl/op_task.cc | 17 ++++----
 16 files changed, 101 insertions(+), 48 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
index 8a1ae96b3e..da6ba86638 100644
--- a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
+++ b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
@@ -14,7 +14,8 @@ struct AtomicDependencySet {
   void add_writer(Realm::Event writer);
   void add_reader(Realm::Event reader);
 
-  Realm::Event get_current_outstanding_events() const;
+  Realm::Event get_dependency_for_writer() const;
+  Realm::Event get_dependency_for_reader() const;
 
 private:
   Realm::Event writer;
diff --git a/lib/realm-execution/include/realm-execution/dependency_set.h b/lib/realm-execution/include/realm-execution/dependency_set.h
index a7100076b2..629a40e2e7 100644
--- a/lib/realm-execution/include/realm-execution/dependency_set.h
+++ b/lib/realm-execution/include/realm-execution/dependency_set.h
@@ -16,8 +16,8 @@ struct DependencySet {
   void add_writer(DynamicValueAttrs const &value, Realm::Event writer);
   void add_reader(DynamicValueAttrs const &value, Realm::Event reader);
 
-  Realm::Event
-      get_current_outstanding_events(DynamicValueAttrs const &value) const;
+  Realm::Event get_dependency_for_writer(DynamicValueAttrs const &value) const;
+  Realm::Event get_dependency_for_reader(DynamicValueAttrs const &value) const;
 
 private:
   AtomicDependencySet &
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
index d2ed093c0b..5530f473d8 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -14,7 +14,8 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     RealmContext &ctx,
     ProfilingSettings const &profiling_settings,
     FFIterationConfig const &iteration_config,
-    OptimizerAttrs const &optimizer_attrs);
+    OptimizerAttrs const &optimizer_attrs,
+    Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
index d4c397bb37..7134973ead 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -12,7 +12,8 @@ void controller_task_body(
 Realm::Event
     collective_spawn_controller_task(RealmContext &ctx,
                                      Realm::Processor &target_proc,
-                                     std::function<void(RealmContext &)> thunk);
+                                     std::function<void(RealmContext &)> thunk,
+                                     Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
index fc6c8bdb9f..0f92b35c24 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
@@ -14,7 +14,8 @@ Realm::Event spawn_device_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificPerDeviceOpState const &result,
-    DeviceSpecificPerDeviceOpState *origin_result_ptr);
+    DeviceSpecificPerDeviceOpState *origin_result_ptr,
+    Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
index af07139483..7842963c7b 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
@@ -21,7 +21,8 @@ std::optional<Realm::Event>
                            ProfilingSettings const &profiling_settings,
                            FFIterationConfig const &iteration_config,
                            OptimizerAttrs const &optimizer_attrs,
-                           DeviceSpecificPerDeviceOpState *result_ptr);
+                           DeviceSpecificPerDeviceOpState *result_ptr,
+                           Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 3fcffc30fa..21d8795339 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -13,13 +13,13 @@ namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event
-    spawn_op_task(RealmContext &ctx,
-                  Realm::Processor target_proc,
-                  DynamicNodeInvocation const &invocation,
-                  ProfilingSettings const &profiling_settings,
-                  FFIterationConfig const &iteration_config,
-                  std::optional<OptimizerAttrs> const &optimizer_attrs);
+Realm::Event spawn_op_task(RealmContext &ctx,
+                           Realm::Processor target_proc,
+                           DynamicNodeInvocation const &invocation,
+                           ProfilingSettings const &profiling_settings,
+                           FFIterationConfig const &iteration_config,
+                           std::optional<OptimizerAttrs> const &optimizer_attrs,
+                           Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
index bdc05b7c46..ba4fcc5a9f 100644
--- a/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
+++ b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
@@ -6,8 +6,8 @@ AtomicDependencySet::AtomicDependencySet(Realm::Event precondition)
     : writer(precondition) {}
 
 void AtomicDependencySet::add_writer(Realm::Event writer) {
-  this->writer = Realm::Event::merge_events(
-      writer, this->get_current_outstanding_events());
+  this->writer =
+      Realm::Event::merge_events(writer, this->get_dependency_for_writer());
   this->readers.clear();
 }
 
@@ -15,9 +15,13 @@ void AtomicDependencySet::add_reader(Realm::Event reader) {
   this->readers.push_back(reader);
 }
 
-Realm::Event AtomicDependencySet::get_current_outstanding_events() const {
+Realm::Event AtomicDependencySet::get_dependency_for_writer() const {
   Realm::Event readers = Realm::Event::merge_events(this->readers);
-  return Realm::Event::merge_events(writer, readers);
+  return Realm::Event::merge_events(this->writer, readers);
+}
+
+Realm::Event AtomicDependencySet::get_dependency_for_reader() const {
+  return this->writer;
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/dependency_set.cc b/lib/realm-execution/src/realm-execution/dependency_set.cc
index 3af03ffcef..84412a125d 100644
--- a/lib/realm-execution/src/realm-execution/dependency_set.cc
+++ b/lib/realm-execution/src/realm-execution/dependency_set.cc
@@ -21,10 +21,18 @@ void DependencySet::add_reader(DynamicValueAttrs const &value,
   atomic_dependence_set.add_reader(reader);
 }
 
-Realm::Event DependencySet::get_current_outstanding_events(
+Realm::Event DependencySet::get_dependency_for_writer(
     DynamicValueAttrs const &value) const {
   if (contains_key(this->atomic_dependencies, value)) {
-    return this->atomic_dependencies.at(value).get_current_outstanding_events();
+    return this->atomic_dependencies.at(value).get_dependency_for_writer();
+  }
+  return this->precondition;
+}
+
+Realm::Event DependencySet::get_dependency_for_reader(
+    DynamicValueAttrs const &value) const {
+  if (contains_key(this->atomic_dependencies, value)) {
+    return this->atomic_dependencies.at(value).get_dependency_for_reader();
   }
   return this->precondition;
 }
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index f7fcea87e7..4ea8d0bbd1 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -14,7 +14,8 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     RealmContext &ctx,
     ProfilingSettings const &profiling_settings,
     FFIterationConfig const &iteration_config,
-    OptimizerAttrs const &optimizer_attrs) {
+    OptimizerAttrs const &optimizer_attrs,
+    Realm::Event precondition) {
 
   // Initialize all operators and save the per-device op state
   ASSERT(no_nodes_are_initialized(dg));
@@ -38,7 +39,8 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
                                profiling_settings,
                                iteration_config,
                                optimizer_attrs,
-                               output);
+                               output,
+                               precondition);
     if (result) {
       result_map[invocation] = output;
     } else {
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index c1654397ec..e636cbf259 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -7,11 +7,14 @@
 #include "realm-execution/tasks/impl/op_task.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
 #include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
 #include "task-spec/dynamic_graph/pass_expansion.h"
 #include "task-spec/dynamic_graph/shard_expansion.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/values.h"
 #include "utils/graph/digraph/algorithms/get_topological_ordering.h"
 #include "utils/optional.h"
 
@@ -77,17 +80,20 @@ PCGInstance create_parallel_computation_graph_instance(
   dg = perform_shard_expansion(dg);
   TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
 
-  // FIXME: for now we're going to be lazy and block on everything rather than
-  // do fine-grained dependencies on instances
-  ctx.get_outstanding_events().wait();
-
   std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
         return backing.backing.at(lgv).first;
       });
 
+  // FIXME: for now we're going to be lazy and make initialization depend on
+  // all outstanding events rather than fine-grained instance dependencies
   dg = perform_distributed_device_state_initialization(
-      dg, ctx, profiling_settings, iteration_config, optimizer_attrs);
+      dg,
+      ctx,
+      profiling_settings,
+      iteration_config,
+      optimizer_attrs,
+      ctx.get_outstanding_events());
 
   // Compute the topological ordering of the graph
   auto [kwarg_graph, node_map] =
@@ -102,7 +108,6 @@ PCGInstance create_parallel_computation_graph_instance(
   // TODO list:
   //  * Realm allocator
   //  * external instances
-  //  * dependencies
   //  * task argument serializer
   //  * copies
 }
@@ -119,6 +124,19 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
   DependencySet dependency_set{ctx.get_outstanding_events()};
   return unordered_map_from_pairs(
       transform(invocations, [&](DynamicNodeInvocation const &invocation) {
+        std::vector<Realm::Event> input_dependencies =
+            transform(vector_of(values(invocation.inputs)),
+                      [&](DynamicValueAttrs const &value) {
+                        return dependency_set.get_dependency_for_reader(value);
+                      });
+        std::vector<Realm::Event> output_dependencies =
+            transform(vector_of(values(invocation.outputs)),
+                      [&](DynamicValueAttrs const &value) {
+                        return dependency_set.get_dependency_for_writer(value);
+                      });
+        Realm::Event dependencies = Realm::Event::merge_events(
+            Realm::Event::merge_events(input_dependencies),
+            Realm::Event::merge_events(output_dependencies));
         Realm::Event result =
             spawn_op_task(ctx,
                           ctx.map_device_coord_to_processor(assert_unwrap(
@@ -126,7 +144,14 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
                           invocation,
                           profiling_settings,
                           iteration_config,
-                          optimizer_attrs);
+                          optimizer_attrs,
+                          dependencies);
+        for (DynamicValueAttrs const &value : values(invocation.inputs)) {
+          dependency_set.add_reader(value, result);
+        }
+        for (DynamicValueAttrs const &value : values(invocation.outputs)) {
+          dependency_set.add_writer(value, result);
+        }
         return std::pair{invocation.node_attrs.layer_guid, result};
       }));
 }
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 7233103cc3..adafea47e6 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -27,7 +27,8 @@ Realm::Event
           .only_kind(Realm::Processor::LOC_PROC)
           .first();
 
-  return collective_spawn_controller_task(*this, target_proc, thunk);
+  return collective_spawn_controller_task(
+      *this, target_proc, thunk, Realm::Event::NO_EVENT);
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
index 2fd5cee52d..285e8acaa7 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
@@ -21,17 +21,19 @@ void controller_task_body(void const *args,
   task_args.thunk(ctx);
 }
 
-Realm::Event collective_spawn_controller_task(
-    RealmContext &ctx,
-    Realm::Processor &target_proc,
-    std::function<void(RealmContext &)> thunk) {
+Realm::Event
+    collective_spawn_controller_task(RealmContext &ctx,
+                                     Realm::Processor &target_proc,
+                                     std::function<void(RealmContext &)> thunk,
+                                     Realm::Event precondition) {
   ControllerTaskArgs task_args;
   task_args.thunk = thunk;
 
   return ctx.collective_spawn_task(target_proc,
                                    task_id_t::CONTROLLER_TASK_ID,
                                    &task_args,
-                                   sizeof(task_args));
+                                   sizeof(task_args),
+                                   precondition);
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
index fa421cda30..610500a94b 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
@@ -36,14 +36,16 @@ Realm::Event spawn_device_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificPerDeviceOpState const &result,
-    DeviceSpecificPerDeviceOpState *origin_result_ptr) {
+    DeviceSpecificPerDeviceOpState *origin_result_ptr,
+    Realm::Event precondition) {
   DeviceInitReturnTaskArgs task_args{result, origin_proc, origin_result_ptr};
 
   return ctx.spawn_task(origin_proc,
                         task_id_t::DEVICE_INIT_RETURN_TASK_ID,
                         &task_args,
                         sizeof(task_args),
-                        Realm::ProfilingRequestSet{});
+                        Realm::ProfilingRequestSet{},
+                        precondition);
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
index cc080255e2..7f36f48921 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
@@ -66,7 +66,8 @@ void device_init_task_body(void const *args,
   spawn_device_init_return_task(ctx,
                                 task_args.origin_proc,
                                 *result_state_ptr,
-                                task_args.origin_result_ptr);
+                                task_args.origin_result_ptr,
+                                Realm::Event::NO_EVENT);
 }
 
 std::optional<Realm::Event>
@@ -76,7 +77,8 @@ std::optional<Realm::Event>
                            ProfilingSettings const &profiling_settings,
                            FFIterationConfig const &iteration_config,
                            OptimizerAttrs const &optimizer_attrs,
-                           DeviceSpecificPerDeviceOpState *result_ptr) {
+                           DeviceSpecificPerDeviceOpState *result_ptr,
+                           Realm::Event precondition) {
   DeviceInitTaskArgs task_args{
       &invocation,
       &profiling_settings,
@@ -97,7 +99,8 @@ std::optional<Realm::Event>
                           assert_unwrap(task_id),
                           &task_args,
                           sizeof(task_args),
-                          Realm::ProfilingRequestSet{});
+                          Realm::ProfilingRequestSet{},
+                          precondition);
   }
   return std::nullopt;
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index 5f6ab40607..216f0badde 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -58,13 +58,13 @@ void op_task_body(void const *args,
       /*device_idx=*/ctx.get_current_device_idx());
 }
 
-Realm::Event
-    spawn_op_task(RealmContext &ctx,
-                  Realm::Processor target_proc,
-                  DynamicNodeInvocation const &invocation,
-                  ProfilingSettings const &profiling_settings,
-                  FFIterationConfig const &iteration_config,
-                  std::optional<OptimizerAttrs> const &optimizer_attrs) {
+Realm::Event spawn_op_task(RealmContext &ctx,
+                           Realm::Processor target_proc,
+                           DynamicNodeInvocation const &invocation,
+                           ProfilingSettings const &profiling_settings,
+                           FFIterationConfig const &iteration_config,
+                           std::optional<OptimizerAttrs> const &optimizer_attrs,
+                           Realm::Event precondition) {
   OpTaskArgs task_args{&invocation,
                        &profiling_settings,
                        &iteration_config,
@@ -75,7 +75,8 @@ Realm::Event
       assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)),
       &task_args,
       sizeof(task_args),
-      Realm::ProfilingRequestSet{});
+      Realm::ProfilingRequestSet{},
+      precondition);
 }
 
 } // namespace FlexFlow

From c05f03e52d9a37c814e38c379fa197639a54011e Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 11:18:53 -0800
Subject: [PATCH 043/113] Add event argument to controller.

---
 lib/realm-execution/include/realm-execution/realm_manager.h | 3 ++-
 lib/realm-execution/src/realm-execution/realm_manager.cc    | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index bf5e8f72f1..8a79476bcf 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -19,7 +19,8 @@ struct RealmManager : private RealmContext {
   RealmManager(RealmManager &&) = delete;
 
   [[nodiscard]] Realm::Event
-      start_controller(std::function<void(RealmContext &)>);
+      start_controller(std::function<void(RealmContext &)>,
+                       Realm::Event wait_on = Realm::Event::NO_EVENT);
 };
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index adafea47e6..fc74fffe5d 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -21,14 +21,14 @@ RealmManager::~RealmManager() {
 }
 
 Realm::Event
-    RealmManager::start_controller(std::function<void(RealmContext &)> thunk) {
+    RealmManager::start_controller(std::function<void(RealmContext &)> thunk,
+                                   Realm::Event wait_on) {
   Realm::Processor target_proc =
       Realm::Machine::ProcessorQuery(Realm::Machine::get_machine())
           .only_kind(Realm::Processor::LOC_PROC)
           .first();
 
-  return collective_spawn_controller_task(
-      *this, target_proc, thunk, Realm::Event::NO_EVENT);
+  return collective_spawn_controller_task(*this, target_proc, thunk, wait_on);
 }
 
 } // namespace FlexFlow

From c76f1e5a8b0c6227b69d46adeac29051ec553986 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 11:53:33 -0800
Subject: [PATCH 044/113] Implement the allocator.

---
 .../include/realm-execution/realm_allocator.h | 31 +++++++++++
 .../include/realm-execution/realm_context.h   |  6 ++-
 .../pcg_instance/pcg_instance.cc              |  2 +-
 .../src/realm-execution/realm_allocator.cc    | 53 +++++++++++++++++++
 .../src/realm-execution/realm_context.cc      | 10 ++--
 5 files changed, 95 insertions(+), 7 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/realm_allocator.h
 create mode 100644 lib/realm-execution/src/realm-execution/realm_allocator.cc

diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
new file mode 100644
index 0000000000..dab6f3ea63
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -0,0 +1,31 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_ALLOCATOR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_ALLOCATOR_H
+
+#include "kernels/allocation.h"
+#include "realm-execution/realm.h"
+
+namespace FlexFlow {
+
+struct RealmAllocator : public IAllocator {
+  RealmAllocator(Realm::Processor processor, Realm::Memory memory);
+  RealmAllocator(RealmAllocator const &) = delete;
+  RealmAllocator(RealmAllocator &&) = delete;
+  ~RealmAllocator() = default;
+
+  void *allocate(size_t) override;
+  void deallocate(void *) override;
+
+  DeviceType get_allocation_device_type() const override;
+
+private:
+  Realm::Processor processor;
+  Realm::Memory memory;
+  std::unordered_map<void *, Realm::RegionInstance> ptr_instances;
+};
+CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmAllocator);
+
+Allocator get_realm_allocator(Realm::Processor processor, Realm::Memory memory);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index e28e91234e..755bf595d6 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -6,6 +6,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "realm-execution/realm.h"
+#include "realm-execution/realm_allocator.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include <unordered_map>
 
@@ -23,11 +24,11 @@ struct RealmContext {
   // Device mapping
   Realm::Processor
       map_device_coord_to_processor(MachineSpaceCoordinate const &);
-  Realm::Memory get_nearest_memory(Realm::Processor) const;
+  static Realm::Memory get_nearest_memory(Realm::Processor);
 
   // Current device context
   Realm::Processor get_current_processor() const;
-  Allocator &get_current_device_allocator() const;
+  Allocator &get_current_device_allocator();
   device_handle_t const &get_current_device_handle() const;
   device_id_t get_current_device_idx() const;
 
@@ -68,6 +69,7 @@ struct RealmContext {
 protected:
   Realm::Runtime runtime;
   Realm::Processor processor;
+  Allocator allocator;
   std::vector<Realm::Event> outstanding_events;
   std::unordered_map<std::pair<Realm::AddressSpace, Realm::Processor::Kind>,
                      std::vector<Realm::Processor>>
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index e636cbf259..93b42743a0 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -106,7 +106,7 @@ PCGInstance create_parallel_computation_graph_instance(
       ctx, invocation_topo_order, optimizer_attrs, logit_grad_tensor};
 
   // TODO list:
-  //  * Realm allocator
+  //  * current device handle
   //  * external instances
   //  * task argument serializer
   //  * copies
diff --git a/lib/realm-execution/src/realm-execution/realm_allocator.cc b/lib/realm-execution/src/realm-execution/realm_allocator.cc
new file mode 100644
index 0000000000..f24106b0bc
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_allocator.cc
@@ -0,0 +1,53 @@
+#include "realm-execution/realm_allocator.h"
+#include "kernels/device.h"
+#include "pcg/device_type.dtg.h"
+
+namespace FlexFlow {
+
+RealmAllocator::RealmAllocator(Realm::Processor processor, Realm::Memory memory)
+    : processor(processor), memory(memory) {}
+
+void *RealmAllocator::allocate(size_t requested_memory_size) {
+  Realm::Rect<1> bounds{Realm::Point<1>::ZEROES(),
+                        Realm::Point<1>{requested_memory_size} -
+                            Realm::Point<1>::ONES()};
+  std::vector<size_t> field_sizes{1};
+  Realm::RegionInstance inst;
+  Realm::Event ready =
+      Realm::RegionInstance::create_instance(inst,
+                                             this->memory,
+                                             bounds,
+                                             field_sizes,
+                                             0 /*SOA*/,
+                                             Realm::ProfilingRequestSet{});
+  ready.wait();
+  void *ptr =
+      inst.pointer_untyped(/*offset=*/0, /*datalen=*/requested_memory_size);
+  ASSERT(ptr);
+  this->ptr_instances.insert({ptr, inst});
+  return ptr;
+}
+
+void RealmAllocator::deallocate(void *ptr) {
+  this->ptr_instances.at(ptr).destroy(Realm::Event::NO_EVENT);
+  this->ptr_instances.erase(ptr);
+}
+
+DeviceType RealmAllocator::get_allocation_device_type() const {
+  switch (this->processor.kind()) {
+    case Realm::Processor::Kind::LOC_PROC:
+      return DeviceType::CPU;
+    case Realm::Processor::Kind::TOC_PROC:
+      return DeviceType::GPU;
+    default:
+      PANIC("Unhandled Realm::Processor::Kind", this->processor.kind());
+  }
+}
+
+Allocator get_realm_allocator(Realm::Processor processor,
+                              Realm::Memory memory) {
+  Allocator allocator = Allocator::create<RealmAllocator>(processor, memory);
+  return allocator;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 781561c95a..a77383779f 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -14,7 +14,9 @@
 
 namespace FlexFlow {
 
-RealmContext::RealmContext(Realm::Processor proc) : processor(proc) {}
+RealmContext::RealmContext(Realm::Processor proc)
+    : processor(proc), allocator(get_realm_allocator(
+                           proc, RealmContext::get_nearest_memory(proc))) {}
 
 RealmContext::~RealmContext() {
   if (!this->outstanding_events.empty()) {
@@ -51,7 +53,7 @@ Realm::Processor RealmContext::map_device_coord_to_processor(
   return this->processors.at(std::pair{as, kind}).at(int{proc_in_node});
 }
 
-Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) const {
+Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) {
   // FIMXE: this isn't going to do what you expect until
   // https://github.com/StanfordLegion/realm/pull/392 merges
   Realm::Machine::MemoryQuery mq(Realm::Machine::get_machine());
@@ -64,8 +66,8 @@ Realm::Processor RealmContext::get_current_processor() const {
   return this->processor;
 }
 
-Allocator &RealmContext::get_current_device_allocator() const {
-  NOT_IMPLEMENTED();
+Allocator &RealmContext::get_current_device_allocator() {
+  return this->allocator;
 }
 
 device_handle_t const &RealmContext::get_current_device_handle() const {

From bde3ea94f268af1525ac358bee2c0a8024832b60 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 12:17:50 -0800
Subject: [PATCH 045/113] Implement device handle.

---
 .../include/realm-execution/realm_allocator.h |  2 +
 .../include/realm-execution/realm_context.h   | 10 ++++-
 .../pcg_instance/pcg_instance.cc              |  1 -
 .../src/realm-execution/realm_context.cc      | 42 ++++++++++++++++---
 4 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
index dab6f3ea63..d72f2d7f91 100644
--- a/lib/realm-execution/include/realm-execution/realm_allocator.h
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -8,6 +8,8 @@ namespace FlexFlow {
 
 struct RealmAllocator : public IAllocator {
   RealmAllocator(Realm::Processor processor, Realm::Memory memory);
+
+  RealmAllocator() = delete;
   RealmAllocator(RealmAllocator const &) = delete;
   RealmAllocator(RealmAllocator &&) = delete;
   ~RealmAllocator() = default;
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 755bf595d6..eb4d6d0935 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -3,18 +3,19 @@
 
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
+#include "kernels/managed_per_device_ff_handle.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "realm-execution/realm.h"
-#include "realm-execution/realm_allocator.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
+#include <optional>
 #include <unordered_map>
 
 namespace FlexFlow {
 
 struct RealmContext {
 public:
-  RealmContext(Realm::Processor);
+  RealmContext(Realm::Processor processor);
   virtual ~RealmContext();
 
   RealmContext() = delete;
@@ -66,10 +67,15 @@ struct RealmContext {
 
   void discover_machine_topology();
 
+  static std::optional<ManagedPerDeviceFFHandle>
+      make_device_handle_for_processor(Realm::Processor processor);
+
 protected:
   Realm::Runtime runtime;
   Realm::Processor processor;
   Allocator allocator;
+  std::optional<ManagedPerDeviceFFHandle> managed_handle;
+  device_handle_t device_handle;
   std::vector<Realm::Event> outstanding_events;
   std::unordered_map<std::pair<Realm::AddressSpace, Realm::Processor::Kind>,
                      std::vector<Realm::Processor>>
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 93b42743a0..d56dbb9ca9 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -106,7 +106,6 @@ PCGInstance create_parallel_computation_graph_instance(
       ctx, invocation_topo_order, optimizer_attrs, logit_grad_tensor};
 
   // TODO list:
-  //  * current device handle
   //  * external instances
   //  * task argument serializer
   //  * copies
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index a77383779f..38ce052da9 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -1,22 +1,27 @@
 #include "realm-execution/realm_context.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/device_handle_t.h"
 #include "op-attrs/datatype.h"
 #include "op-attrs/tensor_dims.dtg.h"
 #include "pcg/device_id_t.h"
 #include "pcg/device_type.dtg.h"
+#include "realm-execution/realm_allocator.h"
 #include "realm-execution/tasks/realm_task_id_t.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/transform.h"
-#include "utils/exception.h"
 #include "utils/nonnegative_int/nonnegative_int.h"
 #include "utils/one_to_many/one_to_many.h"
 #include "utils/positive_int/positive_int.h"
 
 namespace FlexFlow {
 
-RealmContext::RealmContext(Realm::Processor proc)
-    : processor(proc), allocator(get_realm_allocator(
-                           proc, RealmContext::get_nearest_memory(proc))) {}
+RealmContext::RealmContext(Realm::Processor processor)
+    : processor(processor),
+      allocator(get_realm_allocator(
+          processor, RealmContext::get_nearest_memory(processor))),
+      managed_handle(RealmContext::make_device_handle_for_processor(processor)),
+      device_handle(device_handle_t_from_managed_handle(managed_handle)) {}
 
 RealmContext::~RealmContext() {
   if (!this->outstanding_events.empty()) {
@@ -54,6 +59,10 @@ Realm::Processor RealmContext::map_device_coord_to_processor(
 }
 
 Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) {
+  if (!proc.exists()) {
+    return Realm::Memory::NO_MEMORY;
+  }
+
   // FIMXE: this isn't going to do what you expect until
   // https://github.com/StanfordLegion/realm/pull/392 merges
   Realm::Machine::MemoryQuery mq(Realm::Machine::get_machine());
@@ -71,8 +80,9 @@ Allocator &RealmContext::get_current_device_allocator() {
 }
 
 device_handle_t const &RealmContext::get_current_device_handle() const {
-  NOT_IMPLEMENTED();
+  return this->device_handle;
 }
+
 device_id_t RealmContext::get_current_device_idx() const {
   Realm::Processor proc = this->get_current_processor();
 
@@ -245,4 +255,26 @@ void RealmContext::discover_machine_topology() {
   }
 }
 
+std::optional<ManagedPerDeviceFFHandle>
+    RealmContext::make_device_handle_for_processor(Realm::Processor processor) {
+  if (!processor.exists()) {
+    return std::nullopt;
+  }
+
+  switch (processor.kind()) {
+    case Realm::Processor::LOC_PROC:
+      return std::nullopt;
+    case Realm::Processor::TOC_PROC:
+      // FIXME: not sure what workSpaceSize to choose here
+      return initialize_multi_gpu_handle(
+          /*num_ranks=*/Realm::Machine::get_machine().get_address_space_count(),
+          /*my_rank=*/processor.address_space(),
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+    default:
+      PANIC("Unhandled Realm::ProcessorKind",
+            fmt::to_string(int{processor.kind()}));
+  }
+}
+
 } // namespace FlexFlow

From ebab2bd2f6bcdbee2a2ccc54edbc8d6e1b2831b1 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 14:47:23 -0800
Subject: [PATCH 046/113] Distributed device handle initialization.

---
 .../distributed_device_handle.h               |  38 +++++++
 .../impl/device_handle_init_return_task.h     |  24 +++++
 .../tasks/impl/device_handle_init_task.h      |  24 +++++
 .../tasks/impl/device_init_task.h             |  29 -----
 ...task.h => device_state_init_return_task.h} |   8 +-
 .../tasks/impl/device_state_init_task.h       |  29 +++++
 .../realm-execution/tasks/task_id_t.dtg.toml  |   8 +-
 .../distributed_device_handle.cc              |  50 +++++++++
 ...distributed_device_state_initialization.cc |  18 ++--
 .../impl/device_handle_init_return_task.cc    |  55 ++++++++++
 .../tasks/impl/device_handle_init_task.cc     | 100 ++++++++++++++++++
 .../tasks/impl/device_init_return_task.cc     |  51 ---------
 .../impl/device_state_init_return_task.cc     |  53 ++++++++++
 ...init_task.cc => device_state_init_task.cc} |  67 ++++++------
 .../tasks/realm_task_registry.cc              |  10 +-
 15 files changed, 432 insertions(+), 132 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/distributed_device_handle.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
 delete mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_init_return_task.h => device_state_init_return_task.h} (77%)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
 create mode 100644 lib/realm-execution/src/realm-execution/distributed_device_handle.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
 delete mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
 rename lib/realm-execution/src/realm-execution/tasks/impl/{device_init_task.cc => device_state_init_task.cc} (58%)

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
new file mode 100644
index 0000000000..ca3f08fc41
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -0,0 +1,38 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
+
+#include "kernels/managed_per_device_ff_handle.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/device_specific.h"
+#include <map>
+#include <optional>
+
+namespace FlexFlow {
+
+struct DistributedDeviceHandle {
+public:
+  DistributedDeviceHandle() = delete;
+  explicit DistributedDeviceHandle(
+      std::map<Realm::Processor,
+               DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>> const
+          &handles);
+
+  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &
+      at(Realm::Processor processor) const;
+
+private:
+  std::map<Realm::Processor,
+           DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>>
+      handles;
+};
+
+DistributedDeviceHandle create_distributed_device_handle(
+    RealmContext &ctx,
+    size_t workSpaceSize,
+    bool allowTensorOpMathConversion,
+    Realm::Event precondition = Realm::Event::NO_EVENT);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
new file mode 100644
index 0000000000..8b358ee4ce
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
@@ -0,0 +1,24 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
+
+#include "kernels/managed_per_device_ff_handle.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/device_specific_per_device_op_state.dtg.h"
+
+namespace FlexFlow {
+
+void device_handle_init_return_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+Realm::Event spawn_device_handle_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &result,
+    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
+        *origin_result_ptr,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
new file mode 100644
index 0000000000..c26633bd9a
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
@@ -0,0 +1,24 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
+
+#include "kernels/managed_per_device_ff_handle.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/device_specific_per_device_op_state.dtg.h"
+
+namespace FlexFlow {
+
+void device_handle_init_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+Realm::Event spawn_device_handle_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    size_t workSpaceSize,
+    bool allowTensorOpMathConversion,
+    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *result_ptr,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
deleted file mode 100644
index 7842963c7b..0000000000
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_task.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_TASK_H
-
-#include "kernels/profiling_settings.dtg.h"
-#include "pcg/optimizer_attrs.dtg.h"
-#include "realm-execution/realm.h"
-#include "realm-execution/realm_context.h"
-#include "task-spec/device_specific_per_device_op_state.dtg.h"
-#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
-#include "task-spec/ff_iteration_config.dtg.h"
-
-namespace FlexFlow {
-
-void device_init_task_body(
-    void const *, size_t, void const *, size_t, Realm::Processor);
-
-std::optional<Realm::Event>
-    spawn_device_init_task(RealmContext &ctx,
-                           Realm::Processor &target_proc,
-                           DynamicNodeInvocation const &invocation,
-                           ProfilingSettings const &profiling_settings,
-                           FFIterationConfig const &iteration_config,
-                           OptimizerAttrs const &optimizer_attrs,
-                           DeviceSpecificPerDeviceOpState *result_ptr,
-                           Realm::Event precondition);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
similarity index 77%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
rename to lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
index 0f92b35c24..8f44680815 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_RETURN_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_INIT_RETURN_TASK_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
 
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
@@ -7,10 +7,10 @@
 
 namespace FlexFlow {
 
-void device_init_return_task_body(
+void device_state_init_return_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_device_init_return_task(
+Realm::Event spawn_device_state_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificPerDeviceOpState const &result,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
new file mode 100644
index 0000000000..4cd65a0a2a
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -0,0 +1,29 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_TASK_H
+
+#include "kernels/profiling_settings.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/device_specific_per_device_op_state.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+
+namespace FlexFlow {
+
+void device_state_init_task_body(
+    void const *, size_t, void const *, size_t, Realm::Processor);
+
+std::optional<Realm::Event>
+    spawn_device_state_init_task(RealmContext &ctx,
+                                 Realm::Processor target_proc,
+                                 DynamicNodeInvocation const &invocation,
+                                 ProfilingSettings const &profiling_settings,
+                                 FFIterationConfig const &iteration_config,
+                                 OptimizerAttrs const &optimizer_attrs,
+                                 DeviceSpecificPerDeviceOpState *result_ptr,
+                                 Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
index 34e5183488..97b19b5f51 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
@@ -12,7 +12,13 @@ features = [
 name = "CONTROLLER_TASK_ID"
 
 [[values]]
-name = "DEVICE_INIT_RETURN_TASK_ID"
+name = "DEVICE_HANDLE_INIT_TASK_ID"
+
+[[values]]
+name = "DEVICE_HANDLE_INIT_RETURN_TASK_ID"
+
+[[values]]
+name = "DEVICE_STATE_INIT_RETURN_TASK_ID"
 
 [[values]]
 name = "IMAGE_INIT_TASK_ID"
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
new file mode 100644
index 0000000000..00c2e76360
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
@@ -0,0 +1,50 @@
+#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "task-spec/device_specific.h"
+
+namespace FlexFlow {
+
+DistributedDeviceHandle::DistributedDeviceHandle(
+    std::map<Realm::Processor,
+             DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>> const
+        &handles)
+    : handles(handles) {}
+
+DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &
+    DistributedDeviceHandle::at(Realm::Processor processor) const {
+  return this->handles.at(processor);
+}
+
+DistributedDeviceHandle
+    create_distributed_device_handle(RealmContext &ctx,
+                                     size_t workSpaceSize,
+                                     bool allowTensorOpMathConversion,
+                                     Realm::Event precondition) {
+  std::map<Realm::Processor,
+           DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>>
+      handles;
+
+  // Allocate space for the result before launching any tasks
+  Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+  for (Realm::Processor proc : pq) {
+    handles.insert(
+        {proc,
+         DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>::create(
+             ctx.get_current_device_idx(), std::nullopt)});
+  }
+
+  for (auto &[proc, handle] : handles) {
+    spawn_device_handle_init_task(ctx,
+                                  proc,
+                                  workSpaceSize,
+                                  allowTensorOpMathConversion,
+                                  &handle,
+                                  precondition);
+  }
+
+  ctx.get_outstanding_events().wait();
+
+  return DistributedDeviceHandle{handles};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index 4ea8d0bbd1..9627a71e87 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -1,6 +1,6 @@
 #include "realm-execution/distributed_device_state_initialization.h"
 #include "local-execution/device_state_initialization.h"
-#include "realm-execution/tasks/impl/device_init_task.h"
+#include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "utils/optional.h"
@@ -33,14 +33,14 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
         static_cast<DeviceSpecificPerDeviceOpState *>(
             malloc(sizeof(DeviceSpecificPerDeviceOpState)));
     std::optional<Realm::Event> result =
-        spawn_device_init_task(ctx,
-                               target_proc,
-                               invocation,
-                               profiling_settings,
-                               iteration_config,
-                               optimizer_attrs,
-                               output,
-                               precondition);
+        spawn_device_state_init_task(ctx,
+                                     target_proc,
+                                     invocation,
+                                     profiling_settings,
+                                     iteration_config,
+                                     optimizer_attrs,
+                                     output,
+                                     precondition);
     if (result) {
       result_map[invocation] = output;
     } else {
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
new file mode 100644
index 0000000000..2839beef0c
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
@@ -0,0 +1,55 @@
+#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+// FIXME: handed to spawn_task as raw bytes; is this trivially copyable?
+struct DeviceHandleInitReturnTaskArgs {
+public:
+  DeviceHandleInitReturnTaskArgs() = delete;
+  DeviceHandleInitReturnTaskArgs(
+      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> result,
+      Realm::Processor origin_proc,
+      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
+          *origin_result_ptr)
+      : result(result), origin_proc(origin_proc),
+        origin_result_ptr(origin_result_ptr) {}
+
+public:
+  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> result;
+  Realm::Processor origin_proc;
+  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *origin_result_ptr;
+};
+
+void device_handle_init_return_task_body(void const *args,
+                                         size_t arglen,
+                                         void const *userdata,
+                                         size_t userlen,
+                                         Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceHandleInitReturnTaskArgs));
+  DeviceHandleInitReturnTaskArgs task_args =
+      *reinterpret_cast<DeviceHandleInitReturnTaskArgs const *>(args);
+
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+  *task_args.origin_result_ptr = task_args.result;
+}
+
+Realm::Event spawn_device_handle_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &result,
+    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
+        *origin_result_ptr,
+    Realm::Event precondition) {
+  DeviceHandleInitReturnTaskArgs task_args{
+      result, origin_proc, origin_result_ptr};
+
+  return ctx.spawn_task(origin_proc,
+                        task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID,
+                        &task_args,
+                        sizeof(task_args),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
new file mode 100644
index 0000000000..86a576d26b
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
@@ -0,0 +1,100 @@
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+#include <type_traits>
+
+namespace FlexFlow {
+
+// TODO: at some point we're going to have to actually serialize these, but for
+// now just pass the pointer and assume we're running inside a single address
+// space. The struct is shipped to the task as raw bytes (see static_assert).
+struct DeviceHandleInitTaskArgs {
+  DeviceHandleInitTaskArgs() = delete;
+  DeviceHandleInitTaskArgs(
+      size_t workSpaceSize,
+      bool allowTensorOpMathConversion,
+      Realm::Processor origin_proc,
+      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
+          *origin_result_ptr)
+      : workSpaceSize(workSpaceSize),
+        allowTensorOpMathConversion(allowTensorOpMathConversion),
+        origin_proc(origin_proc), origin_result_ptr(origin_result_ptr) {}
+
+public:
+  size_t workSpaceSize;
+  bool allowTensorOpMathConversion;
+  Realm::Processor origin_proc;
+  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *origin_result_ptr;
+};
+static_assert(std::is_trivially_copyable_v<DeviceHandleInitTaskArgs>);
+
+static std::optional<ManagedPerDeviceFFHandle *>
+    make_device_handle_for_processor(Realm::Processor processor,
+                                     size_t workSpaceSize,
+                                     bool allowTensorOpMathConversion) {
+  switch (processor.kind()) {
+    case Realm::Processor::LOC_PROC:
+      return std::nullopt;
+    case Realm::Processor::TOC_PROC:
+      return new ManagedPerDeviceFFHandle{initialize_multi_gpu_handle(
+          /*num_ranks=*/Realm::Machine::get_machine().get_address_space_count(),
+          /*my_rank=*/processor.address_space(),
+          /*workSpaceSize=*/workSpaceSize,
+          /*allowTensorOpMathConversion=*/allowTensorOpMathConversion)};
+    default:
+      PANIC("Unhandled Realm::ProcessorKind",
+            fmt::to_string(int{processor.kind()}));
+  }
+}
+
+void device_handle_init_task_body(void const *args,
+                                  size_t arglen,
+                                  void const *userdata,
+                                  size_t userlen,
+                                  Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceHandleInitTaskArgs));
+  DeviceHandleInitTaskArgs task_args =
+      *reinterpret_cast<DeviceHandleInitTaskArgs const *>(args);
+
+  // FIXME: serialize instead of passing pointers around
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+
+  RealmContext ctx{proc};
+  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> managed_handle =
+      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>::create(
+          ctx.get_current_device_idx(),
+          make_device_handle_for_processor(
+              proc,
+              task_args.workSpaceSize,
+              task_args.allowTensorOpMathConversion));
+
+  spawn_device_handle_init_return_task(ctx,
+                                       task_args.origin_proc,
+                                       managed_handle,
+                                       task_args.origin_result_ptr,
+                                       Realm::Event::NO_EVENT);
+}
+
+Realm::Event spawn_device_handle_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    size_t workSpaceSize,
+    bool allowTensorOpMathConversion,
+    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *result_ptr,
+    Realm::Event precondition) {
+  DeviceHandleInitTaskArgs task_args{
+      workSpaceSize,
+      allowTensorOpMathConversion,
+      ctx.get_current_processor(),
+      result_ptr,
+  };
+
+  return ctx.spawn_task(target_proc,
+                        task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
+                        &task_args,
+                        sizeof(task_args),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
deleted file mode 100644
index 610500a94b..0000000000
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_return_task.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "realm-execution/tasks/impl/device_init_task.h"
-#include "realm-execution/tasks/task_id_t.dtg.h"
-
-namespace FlexFlow {
-
-// FIXME: Can't make this trivially copyable?
-struct DeviceInitReturnTaskArgs {
-public:
-  DeviceInitReturnTaskArgs() = delete;
-  DeviceInitReturnTaskArgs(DeviceSpecificPerDeviceOpState result,
-                           Realm::Processor origin_proc,
-                           DeviceSpecificPerDeviceOpState *origin_result_ptr)
-      : result(result), origin_proc(origin_proc),
-        origin_result_ptr(origin_result_ptr) {}
-
-public:
-  DeviceSpecificPerDeviceOpState result;
-  Realm::Processor origin_proc;
-  DeviceSpecificPerDeviceOpState *origin_result_ptr;
-};
-
-void device_init_return_task_body(void const *args,
-                                  size_t arglen,
-                                  void const *userdata,
-                                  size_t userlen,
-                                  Realm::Processor proc) {
-  ASSERT(arglen == sizeof(DeviceInitReturnTaskArgs));
-  DeviceInitReturnTaskArgs task_args =
-      *reinterpret_cast<DeviceInitReturnTaskArgs const *>(args);
-
-  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
-  *task_args.origin_result_ptr = task_args.result;
-}
-
-Realm::Event spawn_device_init_return_task(
-    RealmContext &ctx,
-    Realm::Processor origin_proc,
-    DeviceSpecificPerDeviceOpState const &result,
-    DeviceSpecificPerDeviceOpState *origin_result_ptr,
-    Realm::Event precondition) {
-  DeviceInitReturnTaskArgs task_args{result, origin_proc, origin_result_ptr};
-
-  return ctx.spawn_task(origin_proc,
-                        task_id_t::DEVICE_INIT_RETURN_TASK_ID,
-                        &task_args,
-                        sizeof(task_args),
-                        Realm::ProfilingRequestSet{},
-                        precondition);
-}
-
-} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
new file mode 100644
index 0000000000..c1bd7c1081
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
@@ -0,0 +1,53 @@
+#include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+// FIXME: handed to spawn_task as raw bytes; is this trivially copyable?
+struct DeviceStateInitReturnTaskArgs {
+public:
+  DeviceStateInitReturnTaskArgs() = delete;
+  DeviceStateInitReturnTaskArgs(
+      DeviceSpecificPerDeviceOpState result,
+      Realm::Processor origin_proc,
+      DeviceSpecificPerDeviceOpState *origin_result_ptr)
+      : result(result), origin_proc(origin_proc),
+        origin_result_ptr(origin_result_ptr) {}
+
+public:
+  DeviceSpecificPerDeviceOpState result;
+  Realm::Processor origin_proc;
+  DeviceSpecificPerDeviceOpState *origin_result_ptr;
+};
+
+void device_state_init_return_task_body(void const *args,
+                                        size_t arglen,
+                                        void const *userdata,
+                                        size_t userlen,
+                                        Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceStateInitReturnTaskArgs));
+  DeviceStateInitReturnTaskArgs task_args =
+      *reinterpret_cast<DeviceStateInitReturnTaskArgs const *>(args);
+
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+  *task_args.origin_result_ptr = task_args.result;
+}
+
+Realm::Event spawn_device_state_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificPerDeviceOpState const &result,
+    DeviceSpecificPerDeviceOpState *origin_result_ptr,
+    Realm::Event precondition) {
+  DeviceStateInitReturnTaskArgs task_args{
+      result, origin_proc, origin_result_ptr};
+
+  return ctx.spawn_task(origin_proc,
+                        task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID,
+                        &task_args,
+                        sizeof(task_args),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
similarity index 58%
rename from lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index 7f36f48921..f63efba14b 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -1,6 +1,6 @@
-#include "realm-execution/tasks/impl/device_init_task.h"
+#include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "local-execution/device_state_initialization.h"
-#include "realm-execution/tasks/impl/device_init_return_task.h"
+#include "realm-execution/tasks/impl/device_state_init_return_task.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
@@ -14,14 +14,14 @@ namespace FlexFlow {
 // TODO: at some point we're going to have to actually serialize these, but for
 // now just pass the pointer and assume we're running inside a single address
 // space
-struct DeviceInitTaskArgs {
-  DeviceInitTaskArgs() = delete;
-  DeviceInitTaskArgs(DynamicNodeInvocation const *invocation,
-                     ProfilingSettings const *profiling_settings,
-                     FFIterationConfig const *iteration_config,
-                     OptimizerAttrs const *optimizer_attrs,
-                     Realm::Processor origin_proc,
-                     DeviceSpecificPerDeviceOpState *origin_result_ptr)
+struct DeviceStateInitTaskArgs {
+  DeviceStateInitTaskArgs() = delete;
+  DeviceStateInitTaskArgs(DynamicNodeInvocation const *invocation,
+                          ProfilingSettings const *profiling_settings,
+                          FFIterationConfig const *iteration_config,
+                          OptimizerAttrs const *optimizer_attrs,
+                          Realm::Processor origin_proc,
+                          DeviceSpecificPerDeviceOpState *origin_result_ptr)
       : invocation(invocation), profiling_settings(profiling_settings),
         iteration_config(iteration_config), optimizer_attrs(optimizer_attrs),
         origin_proc(origin_proc), origin_result_ptr(origin_result_ptr) {}
@@ -34,16 +34,17 @@ struct DeviceInitTaskArgs {
   Realm::Processor origin_proc;
   DeviceSpecificPerDeviceOpState *origin_result_ptr;
 };
-static_assert(std::has_unique_object_representations_v<DeviceInitTaskArgs>);
+static_assert(
+    std::has_unique_object_representations_v<DeviceStateInitTaskArgs>);
 
-void device_init_task_body(void const *args,
-                           size_t arglen,
-                           void const *userdata,
-                           size_t userlen,
-                           Realm::Processor proc) {
-  ASSERT(arglen == sizeof(DeviceInitTaskArgs));
-  DeviceInitTaskArgs task_args =
-      *reinterpret_cast<DeviceInitTaskArgs const *>(args);
+void device_state_init_task_body(void const *args,
+                                 size_t arglen,
+                                 void const *userdata,
+                                 size_t userlen,
+                                 Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceStateInitTaskArgs));
+  DeviceStateInitTaskArgs task_args =
+      *reinterpret_cast<DeviceStateInitTaskArgs const *>(args);
 
   // FIXME: serialize instead of passing pointers around
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
@@ -63,23 +64,23 @@ void device_init_task_body(void const *args,
   // the allocation here
   DeviceSpecificPerDeviceOpState *result_state_ptr =
       new DeviceSpecificPerDeviceOpState{result_state};
-  spawn_device_init_return_task(ctx,
-                                task_args.origin_proc,
-                                *result_state_ptr,
-                                task_args.origin_result_ptr,
-                                Realm::Event::NO_EVENT);
+  spawn_device_state_init_return_task(ctx,
+                                      task_args.origin_proc,
+                                      *result_state_ptr,
+                                      task_args.origin_result_ptr,
+                                      Realm::Event::NO_EVENT);
 }
 
 std::optional<Realm::Event>
-    spawn_device_init_task(RealmContext &ctx,
-                           Realm::Processor &target_proc,
-                           DynamicNodeInvocation const &invocation,
-                           ProfilingSettings const &profiling_settings,
-                           FFIterationConfig const &iteration_config,
-                           OptimizerAttrs const &optimizer_attrs,
-                           DeviceSpecificPerDeviceOpState *result_ptr,
-                           Realm::Event precondition) {
-  DeviceInitTaskArgs task_args{
+    spawn_device_state_init_task(RealmContext &ctx,
+                                 Realm::Processor target_proc,
+                                 DynamicNodeInvocation const &invocation,
+                                 ProfilingSettings const &profiling_settings,
+                                 FFIterationConfig const &iteration_config,
+                                 OptimizerAttrs const &optimizer_attrs,
+                                 DeviceSpecificPerDeviceOpState *result_ptr,
+                                 Realm::Event precondition) {
+  DeviceStateInitTaskArgs task_args{
       &invocation,
       &profiling_settings,
       &iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index c63d4727a9..9150ce6892 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -1,7 +1,7 @@
 #include "realm-execution/tasks/realm_task_registry.h"
 #include "realm-execution/tasks/impl/controller_task.h"
-#include "realm-execution/tasks/impl/device_init_return_task.h"
-#include "realm-execution/tasks/impl/device_init_task.h"
+#include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "realm-execution/tasks/impl/op_task.h"
 #include "realm-execution/tasks/realm_task_id_t.h"
 #include "utils/exception.h"
@@ -48,7 +48,7 @@ Realm::Event register_all_tasks() {
 
   for (task_id_t task_id : init_task_ids) {
     pending_registrations.push_back(register_task(
-        Realm::Processor::TOC_PROC, task_id, device_init_task_body));
+        Realm::Processor::TOC_PROC, task_id, device_state_init_task_body));
   }
 
   std::vector<task_id_t> task_ids = {
@@ -127,8 +127,8 @@ Realm::Event register_all_tasks() {
                                                 controller_task_body));
   pending_registrations.push_back(
       register_task(Realm::Processor::LOC_PROC,
-                    task_id_t::DEVICE_INIT_RETURN_TASK_ID,
-                    device_init_return_task_body));
+                    task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID,
+                    device_state_init_return_task_body));
   return Realm::Event::merge_events(pending_registrations);
 }
 

From d741612badaa6edeb77211c011fd90d453e8fb84 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 16:45:58 -0800
Subject: [PATCH 047/113] Distributed device handle initialization.

---
 lib/kernels/include/kernels/device_handle_t.h |  3 ++
 lib/kernels/src/kernels/device_handle_t.cc    |  9 ++++
 ...ific_managed_per_device_ff_handle.dtg.toml | 16 ++++++
 ...ce_specific_managed_per_device_ff_handle.h | 19 +++++++
 .../distributed_device_handle.h               | 16 +++---
 .../distributed_device_state_initialization.h |  2 +
 .../include/realm-execution/fmt/instance.h    |  4 +-
 .../include/realm-execution/hash/processor.h  | 16 ++++++
 .../pcg_instance/pcg_instance.h               |  2 +
 .../include/realm-execution/realm_context.h   |  3 --
 .../impl/device_handle_init_return_task.h     |  8 ++-
 .../tasks/impl/device_handle_init_task.h      |  5 +-
 .../tasks/impl/device_state_init_task.h       | 20 ++++----
 .../realm-execution/tasks/impl/op_task.h      | 17 ++++---
 ...e_specific_managed_per_device_ff_handle.cc | 21 ++++++++
 .../distributed_device_handle.cc              | 17 +++----
 ...distributed_device_state_initialization.cc |  2 +
 .../src/realm-execution/hash/processor.cc     | 11 +++++
 .../pcg_instance/pcg_instance.cc              | 30 ++++++++----
 .../src/realm-execution/realm_context.cc      | 30 +-----------
 .../impl/device_handle_init_return_task.cc    | 15 +++---
 .../tasks/impl/device_handle_init_task.cc     | 12 ++---
 .../impl/device_state_init_return_task.cc     |  1 -
 .../tasks/impl/device_state_init_task.cc      | 49 +++++++++++--------
 .../src/realm-execution/tasks/impl/op_task.cc | 29 +++++++----
 25 files changed, 224 insertions(+), 133 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
 create mode 100644 lib/realm-execution/include/realm-execution/hash/processor.h
 create mode 100644 lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
 create mode 100644 lib/realm-execution/src/realm-execution/hash/processor.cc

diff --git a/lib/kernels/include/kernels/device_handle_t.h b/lib/kernels/include/kernels/device_handle_t.h
index 9b7769355e..0836503717 100644
--- a/lib/kernels/include/kernels/device_handle_t.h
+++ b/lib/kernels/include/kernels/device_handle_t.h
@@ -9,6 +9,9 @@ namespace FlexFlow {
 device_handle_t device_handle_t_from_managed_handle(
     std::optional<ManagedPerDeviceFFHandle> const &managed_handle);
 
+device_handle_t device_handle_t_from_managed_handle_ptr(
+    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle);
+
 device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle);
 device_handle_t cpu_make_device_handle_t();
 
diff --git a/lib/kernels/src/kernels/device_handle_t.cc b/lib/kernels/src/kernels/device_handle_t.cc
index 85f9e2a388..0225ee8e94 100644
--- a/lib/kernels/src/kernels/device_handle_t.cc
+++ b/lib/kernels/src/kernels/device_handle_t.cc
@@ -11,6 +11,15 @@ device_handle_t device_handle_t_from_managed_handle(
   }
 }
 
+device_handle_t device_handle_t_from_managed_handle_ptr(
+    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
+  // An empty optional selects the CPU handle; otherwise wrap the raw handle.
+  if (!managed_handle.has_value()) {
+    return cpu_make_device_handle_t();
+  }
+  return gpu_make_device_handle_t(managed_handle.value()->raw_handle());
+}
+
 device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle) {
   return device_handle_t{
       ff_handle,
diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml
new file mode 100644
index 0000000000..1458adcba3
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml
@@ -0,0 +1,20 @@
+# Declares the struct FlexFlow::DeviceSpecificManagedPerDeviceFFHandle, which
+# holds a single DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>.
+namespace = "FlexFlow"
+name = "DeviceSpecificManagedPerDeviceFFHandle"
+type = "struct"
+# NOTE(review): "eq" presumably enables generated equality comparison; confirm.
+features = [
+  "eq",
+]
+
+# Headers needed for the field type declared below.
+includes = [
+  "<optional>",
+  "kernels/managed_per_device_ff_handle.h",
+  "task-spec/device_specific.h",
+]
+
+[[fields]]
+name = "handle"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ManagedPerDeviceFFHandle *>>"
diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
new file mode 100644
index 0000000000..eefa6c86ac
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -0,0 +1,26 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_MANAGED_PER_DEVICE_FF_HANDLE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_MANAGED_PER_DEVICE_FF_HANDLE_H
+
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "pcg/device_id_t.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+
+namespace FlexFlow {
+
+// Builds a DeviceSpecificManagedPerDeviceFFHandle from a device id and an
+// optional pointer to a managed handle. The implementation forwards both
+// arguments to DeviceSpecific<...>::create and wraps the result.
+DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
+    device_id_t const &, std::optional<ManagedPerDeviceFFHandle *> const &);
+
+// Converts the wrapped handle to a device_handle_t. The implementation reads
+// the stored optional through DeviceSpecific::get with the given device id
+// and passes it to device_handle_t_from_managed_handle_ptr, which returns a
+// GPU handle when a pointer is present and a CPU handle otherwise.
+device_handle_t device_handle_t_from_device_specific_managed_handle(
+    DeviceSpecificManagedPerDeviceFFHandle const &, device_id_t);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
index ca3f08fc41..3f55c47192 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -1,12 +1,11 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
 
-#include "kernels/managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include "realm-execution/hash/processor.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
-#include "task-spec/device_specific.h"
-#include <map>
-#include <optional>
+#include <unordered_map>
 
 namespace FlexFlow {
 
@@ -14,17 +13,14 @@ struct DistributedDeviceHandle {
 public:
   DistributedDeviceHandle() = delete;
   explicit DistributedDeviceHandle(
-      std::map<Realm::Processor,
-               DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>> const
+      std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> const
           &handles);
 
-  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &
+  DeviceSpecificManagedPerDeviceFFHandle const &
       at(Realm::Processor processor) const;
 
 private:
-  std::map<Realm::Processor,
-           DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>>
-      handles;
+  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> handles;
 };
 
 DistributedDeviceHandle create_distributed_device_handle(
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
index 5530f473d8..ca24ecdd4c 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -3,6 +3,7 @@
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
@@ -13,6 +14,7 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     DynamicOpenDataflowGraph const &dg,
     RealmContext &ctx,
     ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
     Realm::Event precondition);
diff --git a/lib/realm-execution/include/realm-execution/fmt/instance.h b/lib/realm-execution/include/realm-execution/fmt/instance.h
index b2efc59b7d..c7c2df6735 100644
--- a/lib/realm-execution/include/realm-execution/fmt/instance.h
+++ b/lib/realm-execution/include/realm-execution/fmt/instance.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_PAIR_H
-#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_PAIR_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_INSTANCE_H
 
 #include "realm-execution/realm.h"
 #include "utils/check_fmtable.h"
diff --git a/lib/realm-execution/include/realm-execution/hash/processor.h b/lib/realm-execution/include/realm-execution/hash/processor.h
new file mode 100644
index 0000000000..e5eb8eb503
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/hash/processor.h
@@ -0,0 +1,20 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_HASH_PROCESSOR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_HASH_PROCESSOR_H
+
+#include "realm-execution/realm.h"
+#include <functional>
+#include <utility>
+
+namespace std {
+
+// Hash support for Realm::Processor so it can be used as a key in unordered
+// containers (e.g. the std::unordered_map<Realm::Processor, ...> held by
+// DistributedDeviceHandle). <functional> declares the primary std::hash.
+template <>
+struct hash<::FlexFlow::Realm::Processor> {
+  size_t operator()(::FlexFlow::Realm::Processor const &p) const;
+};
+
+} // namespace std
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index 3c5b4189ea..b917477df4 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -10,6 +10,7 @@
 #include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
+#include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
@@ -53,6 +54,7 @@ PCGInstance create_pcg_instance(
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index eb4d6d0935..b8baad41b9 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -30,7 +30,6 @@ struct RealmContext {
   // Current device context
   Realm::Processor get_current_processor() const;
   Allocator &get_current_device_allocator();
-  device_handle_t const &get_current_device_handle() const;
   device_id_t get_current_device_idx() const;
 
   // Task creation
@@ -74,8 +73,6 @@ struct RealmContext {
   Realm::Runtime runtime;
   Realm::Processor processor;
   Allocator allocator;
-  std::optional<ManagedPerDeviceFFHandle> managed_handle;
-  device_handle_t device_handle;
   std::vector<Realm::Event> outstanding_events;
   std::unordered_map<std::pair<Realm::AddressSpace, Realm::Processor::Kind>,
                      std::vector<Realm::Processor>>
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
index 8b358ee4ce..9bae546403 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
@@ -1,10 +1,9 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
 
-#include "kernels/managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
-#include "task-spec/device_specific_per_device_op_state.dtg.h"
 
 namespace FlexFlow {
 
@@ -14,9 +13,8 @@ void device_handle_init_return_task_body(
 Realm::Event spawn_device_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
-    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &result,
-    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
-        *origin_result_ptr,
+    DeviceSpecificManagedPerDeviceFFHandle const &result,
+    DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
     Realm::Event precondition);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
index c26633bd9a..624eb6e682 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
@@ -1,10 +1,9 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
 
-#include "kernels/managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
-#include "task-spec/device_specific_per_device_op_state.dtg.h"
 
 namespace FlexFlow {
 
@@ -16,7 +15,7 @@ Realm::Event spawn_device_handle_init_task(
     Realm::Processor target_proc,
     size_t workSpaceSize,
     bool allowTensorOpMathConversion,
-    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *result_ptr,
+    DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
     Realm::Event precondition);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
index 4cd65a0a2a..933d4f9283 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -3,6 +3,7 @@
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
@@ -14,15 +15,16 @@ namespace FlexFlow {
 void device_state_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-std::optional<Realm::Event>
-    spawn_device_state_init_task(RealmContext &ctx,
-                                 Realm::Processor target_proc,
-                                 DynamicNodeInvocation const &invocation,
-                                 ProfilingSettings const &profiling_settings,
-                                 FFIterationConfig const &iteration_config,
-                                 OptimizerAttrs const &optimizer_attrs,
-                                 DeviceSpecificPerDeviceOpState *result_ptr,
-                                 Realm::Event precondition);
+std::optional<Realm::Event> spawn_device_state_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs,
+    DeviceSpecificPerDeviceOpState *result_ptr,
+    Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 21d8795339..847154192a 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -4,6 +4,7 @@
 #include "kernels/profiling_settings.dtg.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
@@ -13,13 +14,15 @@ namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_op_task(RealmContext &ctx,
-                           Realm::Processor target_proc,
-                           DynamicNodeInvocation const &invocation,
-                           ProfilingSettings const &profiling_settings,
-                           FFIterationConfig const &iteration_config,
-                           std::optional<OptimizerAttrs> const &optimizer_attrs,
-                           Realm::Event precondition);
+Realm::Event
+    spawn_op_task(RealmContext &ctx,
+                  Realm::Processor target_proc,
+                  DynamicNodeInvocation const &invocation,
+                  ProfilingSettings const &profiling_settings,
+                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+                  FFIterationConfig const &iteration_config,
+                  std::optional<OptimizerAttrs> const &optimizer_attrs,
+                  Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
new file mode 100644
index 0000000000..440b9d18f7
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -0,0 +1,28 @@
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "kernels/device_handle_t.h"
+
+namespace FlexFlow {
+
+// Passes the device id and the optional handle pointer to
+// DeviceSpecific::create and packages the result in the wrapper struct.
+DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
+    device_id_t const &device_id,
+    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
+  using TaggedHandle =
+      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>;
+
+  TaggedHandle tagged = TaggedHandle::create(device_id, managed_handle);
+  return DeviceSpecificManagedPerDeviceFFHandle{tagged};
+}
+
+// Reads the wrapped optional through DeviceSpecific::get using device_idx and
+// converts it with device_handle_t_from_managed_handle_ptr.
+device_handle_t device_handle_t_from_device_specific_managed_handle(
+    DeviceSpecificManagedPerDeviceFFHandle const &device_specific,
+    device_id_t device_idx) {
+  std::optional<ManagedPerDeviceFFHandle *> const local_handle =
+      *device_specific.handle.get(device_idx);
+  return device_handle_t_from_managed_handle_ptr(local_handle);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
index 00c2e76360..404feb014c 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
@@ -1,16 +1,16 @@
 #include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_handle_init_task.h"
 #include "task-spec/device_specific.h"
 
 namespace FlexFlow {
 
 DistributedDeviceHandle::DistributedDeviceHandle(
-    std::map<Realm::Processor,
-             DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>> const
+    std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> const
         &handles)
     : handles(handles) {}
 
-DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &
+DeviceSpecificManagedPerDeviceFFHandle const &
     DistributedDeviceHandle::at(Realm::Processor processor) const {
   return this->handles.at(processor);
 }
@@ -20,17 +20,14 @@ DistributedDeviceHandle
                                      size_t workSpaceSize,
                                      bool allowTensorOpMathConversion,
                                      Realm::Event precondition) {
-  std::map<Realm::Processor,
-           DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>>
-      handles;
+  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> handles;
 
   // Allocate space for the result before launching any tasks
   Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
   for (Realm::Processor proc : pq) {
-    handles.insert(
-        {proc,
-         DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>::create(
-             ctx.get_current_device_idx(), std::nullopt)});
+    handles.insert({proc,
+                    make_device_specific_managed_handle(
+                        ctx.get_current_device_idx(), std::nullopt)});
   }
 
   for (auto &[proc, handle] : handles) {
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index 9627a71e87..cab2b49e15 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -13,6 +13,7 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     DynamicOpenDataflowGraph const &dg,
     RealmContext &ctx,
     ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
     Realm::Event precondition) {
@@ -37,6 +38,7 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
                                      target_proc,
                                      invocation,
                                      profiling_settings,
+                                     device_handle.at(target_proc),
                                      iteration_config,
                                      optimizer_attrs,
                                      output,
diff --git a/lib/realm-execution/src/realm-execution/hash/processor.cc b/lib/realm-execution/src/realm-execution/hash/processor.cc
new file mode 100644
index 0000000000..dcc1bc5d06
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/hash/processor.cc
@@ -0,0 +1,15 @@
+#include "realm-execution/hash/processor.h"
+#include <utility>
+
+namespace std {
+
+// Hashes a processor by delegating to the standard hash of its `id` field.
+size_t hash<::FlexFlow::Realm::Processor>::operator()(
+    ::FlexFlow::Realm::Processor const &p) const {
+  using ProcessorId = ::FlexFlow::Realm::Processor::id_t;
+
+  hash<ProcessorId> const id_hasher{};
+  return id_hasher(p.id);
+}
+
+} // namespace std
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index d56dbb9ca9..c79d8e8abd 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -57,6 +57,7 @@ PCGInstance create_parallel_computation_graph_instance(
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config) {
 
   DynamicOpenDataflowGraph dg =
@@ -91,6 +92,7 @@ PCGInstance create_parallel_computation_graph_instance(
       dg,
       ctx,
       profiling_settings,
+      device_handle,
       iteration_config,
       optimizer_attrs,
       ctx.get_outstanding_events());
@@ -117,6 +119,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         std::vector<DynamicNodeInvocation> const &invocations,
         OptimizerAttrs const &optimizer_attrs,
         ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   // For simplicity we'll track a dependency on all outstanding operations up to
   // this point. This will create an effective barrier between phases.
@@ -136,15 +139,16 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         Realm::Event dependencies = Realm::Event::merge_events(
             Realm::Event::merge_events(input_dependencies),
             Realm::Event::merge_events(output_dependencies));
-        Realm::Event result =
-            spawn_op_task(ctx,
-                          ctx.map_device_coord_to_processor(assert_unwrap(
-                              invocation.node_attrs.device_coord)),
-                          invocation,
-                          profiling_settings,
-                          iteration_config,
-                          optimizer_attrs,
-                          dependencies);
+        Realm::Processor target_proc = ctx.map_device_coord_to_processor(
+            assert_unwrap(invocation.node_attrs.device_coord));
+        Realm::Event result = spawn_op_task(ctx,
+                                            target_proc,
+                                            invocation,
+                                            profiling_settings,
+                                            device_handle.at(target_proc),
+                                            iteration_config,
+                                            optimizer_attrs,
+                                            dependencies);
         for (DynamicValueAttrs const &value : values(invocation.inputs)) {
           dependency_set.add_reader(value, result);
         }
@@ -159,6 +163,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_parallel_computation_graph_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       instance.get_execution_order();
@@ -168,6 +173,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
           /*invocations=*/execution_order,
           /*optimizer_attrs=*/instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
+          /*device_handle=*/device_handle,
           /*iteration_config=*/iteration_config);
   instance.update_optimizer_attrs_for_next_iter();
   return result;
@@ -177,6 +183,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_forward_pass_for_parallel_computation_graph_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       filter(instance.get_execution_order(),
@@ -191,6 +198,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
       /*invocations=*/execution_order,
       /*optimizer_attrs=*/instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
+      /*device_handle=*/device_handle,
       /*iteration_config=*/iteration_config);
 }
 
@@ -198,6 +206,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_backward_pass_for_parallel_computation_graph_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       filter(instance.get_execution_order(),
@@ -212,6 +221,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
       /*invocations=*/execution_order,
       /*optimizer_attrs=*/instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
+      /*device_handle=*/device_handle,
       /*iteration_config=*/iteration_config);
 }
 
@@ -219,6 +229,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_update_pass_for_parallel_computation_graph_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       filter(instance.get_execution_order(),
@@ -234,6 +245,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
           /*invocations=*/execution_order,
           /*optimizer_attrs=*/instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
+          /*device_handle=*/device_handle,
           /*iteration_config=*/iteration_config);
   instance.update_optimizer_attrs_for_next_iter();
   return result;
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 38ce052da9..3427e8cbee 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -19,9 +19,7 @@ namespace FlexFlow {
 RealmContext::RealmContext(Realm::Processor processor)
     : processor(processor),
       allocator(get_realm_allocator(
-          processor, RealmContext::get_nearest_memory(processor))),
-      managed_handle(RealmContext::make_device_handle_for_processor(processor)),
-      device_handle(device_handle_t_from_managed_handle(managed_handle)) {}
+          processor, RealmContext::get_nearest_memory(processor))) {}
 
 RealmContext::~RealmContext() {
   if (!this->outstanding_events.empty()) {
@@ -79,10 +77,6 @@ Allocator &RealmContext::get_current_device_allocator() {
   return this->allocator;
 }
 
-device_handle_t const &RealmContext::get_current_device_handle() const {
-  return this->device_handle;
-}
-
 device_id_t RealmContext::get_current_device_idx() const {
   Realm::Processor proc = this->get_current_processor();
 
@@ -255,26 +249,4 @@ void RealmContext::discover_machine_topology() {
   }
 }
 
-std::optional<ManagedPerDeviceFFHandle>
-    RealmContext::make_device_handle_for_processor(Realm::Processor processor) {
-  if (!processor.exists()) {
-    return std::nullopt;
-  }
-
-  switch (processor.kind()) {
-    case Realm::Processor::LOC_PROC:
-      return std::nullopt;
-    case Realm::Processor::TOC_PROC:
-      // FIXME: not sure what workSpaceSize to choose here
-      return initialize_multi_gpu_handle(
-          /*num_ranks=*/Realm::Machine::get_machine().get_address_space_count(),
-          /*my_rank=*/processor.address_space(),
-          /*workSpaceSize=*/1024 * 1024,
-          /*allowTensorOpMathConversion=*/true);
-    default:
-      PANIC("Unhandled Realm::ProcessorKind",
-            fmt::to_string(int{processor.kind()}));
-  }
-}
-
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
index 2839beef0c..bda6f7781c 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
@@ -3,22 +3,20 @@
 
 namespace FlexFlow {
 
-// FIXME: Can't make this trivially copyable?
 struct DeviceHandleInitReturnTaskArgs {
 public:
   DeviceHandleInitReturnTaskArgs() = delete;
   DeviceHandleInitReturnTaskArgs(
-      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> result,
+      DeviceSpecificManagedPerDeviceFFHandle result,
       Realm::Processor origin_proc,
-      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
-          *origin_result_ptr)
+      DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr)
       : result(result), origin_proc(origin_proc),
         origin_result_ptr(origin_result_ptr) {}
 
 public:
-  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> result;
+  DeviceSpecificManagedPerDeviceFFHandle result;
   Realm::Processor origin_proc;
-  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *origin_result_ptr;
+  DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
 };
 
 void device_handle_init_return_task_body(void const *args,
@@ -37,9 +35,8 @@ void device_handle_init_return_task_body(void const *args,
 Realm::Event spawn_device_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
-    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> const &result,
-    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
-        *origin_result_ptr,
+    DeviceSpecificManagedPerDeviceFFHandle const &result,
+    DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
     Realm::Event precondition) {
   DeviceHandleInitReturnTaskArgs task_args{
       result, origin_proc, origin_result_ptr};
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
index 86a576d26b..cd5608ca7e 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_handle_init_return_task.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include <type_traits>
@@ -14,8 +15,7 @@ struct DeviceHandleInitTaskArgs {
       size_t workSpaceSize,
       bool allowTensorOpMathConversion,
       Realm::Processor origin_proc,
-      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>
-          *origin_result_ptr)
+      DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr)
       : workSpaceSize(workSpaceSize),
         allowTensorOpMathConversion(allowTensorOpMathConversion),
         origin_proc(origin_proc), origin_result_ptr(origin_result_ptr) {}
@@ -24,7 +24,7 @@ struct DeviceHandleInitTaskArgs {
   size_t workSpaceSize;
   bool allowTensorOpMathConversion;
   Realm::Processor origin_proc;
-  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *origin_result_ptr;
+  DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
 };
 static_assert(std::is_trivially_copy_constructible_v<DeviceHandleInitTaskArgs>);
 
@@ -60,8 +60,8 @@ void device_handle_init_task_body(void const *args,
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
 
   RealmContext ctx{proc};
-  DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> managed_handle =
-      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>::create(
+  DeviceSpecificManagedPerDeviceFFHandle managed_handle =
+      make_device_specific_managed_handle(
           ctx.get_current_device_idx(),
           make_device_handle_for_processor(
               proc,
@@ -80,7 +80,7 @@ Realm::Event spawn_device_handle_init_task(
     Realm::Processor target_proc,
     size_t workSpaceSize,
     bool allowTensorOpMathConversion,
-    DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>> *result_ptr,
+    DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
     Realm::Event precondition) {
   DeviceHandleInitTaskArgs task_args{
       workSpaceSize,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
index c1bd7c1081..306697e950 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
@@ -3,7 +3,6 @@
 
 namespace FlexFlow {
 
-// FIXME: Can't make this trivially copyable?
 struct DeviceStateInitReturnTaskArgs {
 public:
   DeviceStateInitReturnTaskArgs() = delete;
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index f63efba14b..5a51b1c803 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -1,5 +1,7 @@
 #include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "kernels/device_handle_t.dtg.h"
 #include "local-execution/device_state_initialization.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
@@ -16,26 +18,28 @@ namespace FlexFlow {
 // space
 struct DeviceStateInitTaskArgs {
   DeviceStateInitTaskArgs() = delete;
-  DeviceStateInitTaskArgs(DynamicNodeInvocation const *invocation,
-                          ProfilingSettings const *profiling_settings,
-                          FFIterationConfig const *iteration_config,
-                          OptimizerAttrs const *optimizer_attrs,
-                          Realm::Processor origin_proc,
-                          DeviceSpecificPerDeviceOpState *origin_result_ptr)
+  DeviceStateInitTaskArgs(
+      DynamicNodeInvocation const *invocation,
+      ProfilingSettings const *profiling_settings,
+      DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+      FFIterationConfig const *iteration_config,
+      OptimizerAttrs const *optimizer_attrs,
+      Realm::Processor origin_proc,
+      DeviceSpecificPerDeviceOpState *origin_result_ptr)
       : invocation(invocation), profiling_settings(profiling_settings),
-        iteration_config(iteration_config), optimizer_attrs(optimizer_attrs),
-        origin_proc(origin_proc), origin_result_ptr(origin_result_ptr) {}
+        device_handle(device_handle), iteration_config(iteration_config),
+        optimizer_attrs(optimizer_attrs), origin_proc(origin_proc),
+        origin_result_ptr(origin_result_ptr) {}
 
 public:
   DynamicNodeInvocation const *invocation;
   ProfilingSettings const *profiling_settings;
+  DeviceSpecificManagedPerDeviceFFHandle device_handle;
   FFIterationConfig const *iteration_config;
   OptimizerAttrs const *optimizer_attrs;
   Realm::Processor origin_proc;
   DeviceSpecificPerDeviceOpState *origin_result_ptr;
 };
-static_assert(
-    std::has_unique_object_representations_v<DeviceStateInitTaskArgs>);
 
 void device_state_init_task_body(void const *args,
                                  size_t arglen,
@@ -50,11 +54,14 @@ void device_state_init_task_body(void const *args,
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
 
   RealmContext ctx{proc};
+  device_handle_t device_handle =
+      device_handle_t_from_device_specific_managed_handle(
+          task_args.device_handle, ctx.get_current_device_idx());
   DynamicNodeInvocation result_invocation =
       initialize_node(*task_args.invocation,
                       ctx.get_current_device_allocator(),
                       *task_args.profiling_settings,
-                      ctx.get_current_device_handle(),
+                      device_handle,
                       *task_args.iteration_config,
                       *task_args.optimizer_attrs,
                       ctx.get_current_device_idx());
@@ -71,18 +78,20 @@ void device_state_init_task_body(void const *args,
                                       Realm::Event::NO_EVENT);
 }
 
-std::optional<Realm::Event>
-    spawn_device_state_init_task(RealmContext &ctx,
-                                 Realm::Processor target_proc,
-                                 DynamicNodeInvocation const &invocation,
-                                 ProfilingSettings const &profiling_settings,
-                                 FFIterationConfig const &iteration_config,
-                                 OptimizerAttrs const &optimizer_attrs,
-                                 DeviceSpecificPerDeviceOpState *result_ptr,
-                                 Realm::Event precondition) {
+std::optional<Realm::Event> spawn_device_state_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs,
+    DeviceSpecificPerDeviceOpState *result_ptr,
+    Realm::Event precondition) {
   DeviceStateInitTaskArgs task_args{
       &invocation,
       &profiling_settings,
+      device_handle,
       &iteration_config,
       &optimizer_attrs,
       ctx.get_current_processor(),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index 216f0badde..e17973febb 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -1,5 +1,6 @@
 #include "realm-execution/tasks/impl/op_task.h"
 #include "local-execution/task_execution.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/per_device_op_state.h"
 #include "utils/optional.h"
@@ -15,20 +16,24 @@ struct OpTaskArgs {
   OpTaskArgs() = delete;
   OpTaskArgs(DynamicNodeInvocation const *invocation,
              ProfilingSettings const *profiling_settings,
+             DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
              FFIterationConfig const *iteration_config,
              std::optional<OptimizerAttrs> const *optimizer_attrs,
              Realm::Processor origin_proc)
       : invocation(invocation), profiling_settings(profiling_settings),
-        iteration_config(iteration_config), optimizer_attrs(optimizer_attrs) {}
+        device_handle(device_handle), iteration_config(iteration_config),
+        // origin_proc must be stored too: op_task_body ASSERTs that it shares
+        // an address space with the executing processor.
+        optimizer_attrs(optimizer_attrs), origin_proc(origin_proc) {}
 
 public:
   DynamicNodeInvocation const *invocation;
   ProfilingSettings const *profiling_settings;
+  DeviceSpecificManagedPerDeviceFFHandle device_handle;
   FFIterationConfig const *iteration_config;
   std::optional<OptimizerAttrs> const *optimizer_attrs;
   Realm::Processor origin_proc;
 };
-static_assert(std::has_unique_object_representations_v<OpTaskArgs>);
 
 void op_task_body(void const *args,
                   size_t arglen,
@@ -42,11 +45,14 @@ void op_task_body(void const *args,
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
 
   RealmContext ctx{proc};
+  device_handle_t device_handle =
+      device_handle_t_from_device_specific_managed_handle(
+          task_args.device_handle, ctx.get_current_device_idx());
   execute_dynamic_node_invocation(
       /*invocation=*/*task_args.invocation,
       /*allocator=*/ctx.get_current_device_allocator(),
       /*profiling_settings=*/*task_args.profiling_settings,
-      /*ff_handle=*/ctx.get_current_device_handle(),
+      /*ff_handle=*/device_handle,
       /*per_device_op_state=*/
       transform(task_args.invocation->node_attrs.per_device_op_state,
                 [&](DeviceSpecificPerDeviceOpState const &op_state) {
@@ -58,15 +64,18 @@ void op_task_body(void const *args,
       /*device_idx=*/ctx.get_current_device_idx());
 }
 
-Realm::Event spawn_op_task(RealmContext &ctx,
-                           Realm::Processor target_proc,
-                           DynamicNodeInvocation const &invocation,
-                           ProfilingSettings const &profiling_settings,
-                           FFIterationConfig const &iteration_config,
-                           std::optional<OptimizerAttrs> const &optimizer_attrs,
-                           Realm::Event precondition) {
+Realm::Event
+    spawn_op_task(RealmContext &ctx,
+                  Realm::Processor target_proc,
+                  DynamicNodeInvocation const &invocation,
+                  ProfilingSettings const &profiling_settings,
+                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+                  FFIterationConfig const &iteration_config,
+                  std::optional<OptimizerAttrs> const &optimizer_attrs,
+                  Realm::Event precondition) {
   OpTaskArgs task_args{&invocation,
                        &profiling_settings,
+                       device_handle,
                        &iteration_config,
                        &optimizer_attrs,
                        ctx.get_current_processor()};

From 4f9dbbccd5dda733073397add5d9f591e5295d15 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 17:02:00 -0800
Subject: [PATCH 048/113] Test distributed device handle.

---
 .../realm-execution/distributed_device_handle.h    |  6 ++++--
 .../realm-execution/distributed_device_handle.cc   |  7 ++++---
 .../realm-execution/tasks/realm_task_registry.cc   | 14 ++++++++++++++
 .../test/src/realm-execution/realm_manager.cc      | 14 ++++++++++++--
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
index 3f55c47192..40f3b98fb3 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -13,14 +13,16 @@ struct DistributedDeviceHandle {
 public:
   DistributedDeviceHandle() = delete;
   explicit DistributedDeviceHandle(
-      std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> const
+      std::unordered_map<Realm::Processor,
+                         DeviceSpecificManagedPerDeviceFFHandle> const
           &handles);
 
   DeviceSpecificManagedPerDeviceFFHandle const &
       at(Realm::Processor processor) const;
 
 private:
-  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> handles;
+  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
+      handles;
 };
 
 DistributedDeviceHandle create_distributed_device_handle(
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
index 404feb014c..3cd01f292e 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
@@ -6,8 +6,8 @@
 namespace FlexFlow {
 
 DistributedDeviceHandle::DistributedDeviceHandle(
-    std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> const
-        &handles)
+    std::unordered_map<Realm::Processor,
+                       DeviceSpecificManagedPerDeviceFFHandle> const &handles)
     : handles(handles) {}
 
 DeviceSpecificManagedPerDeviceFFHandle const &
@@ -20,7 +20,8 @@ DistributedDeviceHandle
                                      size_t workSpaceSize,
                                      bool allowTensorOpMathConversion,
                                      Realm::Event precondition) {
-  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle> handles;
+  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
+      handles;
 
   // Allocate space for the result before launching any tasks
   Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 9150ce6892..cff12c2391 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -1,5 +1,7 @@
 #include "realm-execution/tasks/realm_task_registry.h"
 #include "realm-execution/tasks/impl/controller_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
 #include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "realm-execution/tasks/impl/op_task.h"
@@ -125,6 +127,18 @@ Realm::Event register_all_tasks() {
   pending_registrations.push_back(register_task(Realm::Processor::LOC_PROC,
                                                 task_id_t::CONTROLLER_TASK_ID,
                                                 controller_task_body));
+  pending_registrations.push_back(
+      register_task(Realm::Processor::LOC_PROC,
+                    task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
+                    device_handle_init_task_body));
+  pending_registrations.push_back(
+      register_task(Realm::Processor::TOC_PROC,
+                    task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
+                    device_handle_init_task_body));
+  pending_registrations.push_back(
+      register_task(Realm::Processor::LOC_PROC,
+                    task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID,
+                    device_handle_init_return_task_body));
   pending_registrations.push_back(
       register_task(Realm::Processor::LOC_PROC,
                     task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID,
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 94e0d7d0f4..41fa63f4f9 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/realm_manager.h"
+#include "realm-execution/distributed_device_handle.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
@@ -16,8 +17,17 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     // Launch a controller
     int some_data = 123;
-    FlexFlow::Realm::Event event = manager.start_controller(
-        [&](RealmContext &ctx) { ASSERT(some_data == 123); });
+    FlexFlow::Realm::Event event =
+        manager.start_controller([&](RealmContext &ctx) {
+          // Data is captured and retains value
+          ASSERT(some_data == 123);
+
+          // Launch some basic task to ensure everything works
+          DistributedDeviceHandle handle = create_distributed_device_handle(
+              /*ctx=*/ctx,
+              /*workSpaceSize=*/1024 * 1024,
+              /*allowTensorOpMathConversion=*/true);
+        });
     // Need to block on the completion of the event to ensure we don't race
     event.wait();
   }

From 672330ca778a7964e633237da6d550ffb1c5e354 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 20:54:38 -0800
Subject: [PATCH 049/113] Guard the kinds of procs we run on.

---
 .../src/realm-execution/distributed_device_handle.cc     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
index 3cd01f292e..87376be9b1 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
@@ -26,9 +26,12 @@ DistributedDeviceHandle
   // Allocate space for the result before launching any tasks
   Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
   for (Realm::Processor proc : pq) {
-    handles.insert({proc,
-                    make_device_specific_managed_handle(
-                        ctx.get_current_device_idx(), std::nullopt)});
+    if (proc.kind() == Realm::Processor::LOC_PROC ||
+        proc.kind() == Realm::Processor::TOC_PROC) {
+      handles.insert({proc,
+                      make_device_specific_managed_handle(
+                          ctx.get_current_device_idx(), std::nullopt)});
+    }
   }
 
   for (auto &[proc, handle] : handles) {

From 2fb0b34ad4f18d14284f0df27e9954b2cbb5c456 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 13 Feb 2026 10:30:17 -0800
Subject: [PATCH 050/113] Switch to own DeviceSpecific implementation with raw
 pointers.

---
 ...pecific_managed_per_device_ff_handle.dtg.toml | 16 ----------------
 ...evice_specific_managed_per_device_ff_handle.h | 14 +++++++++++++-
 .../realm-execution/distributed_device_handle.h  |  2 +-
 .../tasks/impl/device_handle_init_return_task.h  |  2 +-
 .../tasks/impl/device_handle_init_task.h         |  2 +-
 .../tasks/impl/device_state_init_task.h          |  2 +-
 .../include/realm-execution/tasks/impl/op_task.h |  2 +-
 ...vice_specific_managed_per_device_ff_handle.cc | 16 ++++++++++++----
 8 files changed, 30 insertions(+), 26 deletions(-)
 delete mode 100644 lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml

diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml
deleted file mode 100644
index 1458adcba3..0000000000
--- a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.dtg.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-namespace = "FlexFlow"
-name = "DeviceSpecificManagedPerDeviceFFHandle"
-type = "struct"
-features = [
-  "eq",
-]
-
-includes = [
-  "<optional>",
-  "kernels/managed_per_device_ff_handle.h",
-  "task-spec/device_specific.h",
-]
-
-[[fields]]
-name = "handle"
-type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ManagedPerDeviceFFHandle *>>"
diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
index eefa6c86ac..19a70491a2 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -4,10 +4,26 @@
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "pcg/device_id_t.dtg.h"
-#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
+// A (possibly empty) raw ManagedPerDeviceFFHandle pointer tagged with the
+// device that owns it. The pointer is stored as given and never freed here.
+struct DeviceSpecificManagedPerDeviceFFHandle {
+public:
+  DeviceSpecificManagedPerDeviceFFHandle() = delete;
+  explicit DeviceSpecificManagedPerDeviceFFHandle(
+      device_id_t owner, std::optional<ManagedPerDeviceFFHandle *> handle);
+
+  // Returns the stored handle; device_idx must equal the owning device.
+  std::optional<ManagedPerDeviceFFHandle *> get(device_id_t device_idx) const;
+
+private:
+  device_id_t owner;
+  std::optional<ManagedPerDeviceFFHandle *> handle;
+};
+
 DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
     device_id_t const &, std::optional<ManagedPerDeviceFFHandle *> const &);
 
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
index 40f3b98fb3..268be3583d 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
 
-#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/hash/processor.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
index 9bae546403..a87652b5ce 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
 
-#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
index 624eb6e682..312ed26add 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
 
-#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
index 933d4f9283..4ed8c1726d 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -3,7 +3,7 @@
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 847154192a..9d4c2fd451 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -4,7 +4,7 @@
 #include "kernels/profiling_settings.dtg.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "realm-execution/device_specific_managed_per_device_ff_handle.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index 440b9d18f7..99ff7a6dd6 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -3,19 +3,36 @@
 
 namespace FlexFlow {
 
+// Stores the owning device id alongside the (possibly empty) raw handle
+// pointer exactly as given.
+DeviceSpecificManagedPerDeviceFFHandle::DeviceSpecificManagedPerDeviceFFHandle(
+    device_id_t owner, std::optional<ManagedPerDeviceFFHandle *> handle)
+    : owner(owner), handle(handle) {}
+
+// Returns the stored handle. ASSERTs that device_idx is the owning device.
+std::optional<ManagedPerDeviceFFHandle *>
+    DeviceSpecificManagedPerDeviceFFHandle::get(device_id_t device_idx) const {
+  ASSERT(this->owner == device_idx);
+  return this->handle;
+}
+
+// Factory wrapper that forwards to the two-argument constructor.
 DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
     device_id_t const &device_id,
     std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
-  return DeviceSpecificManagedPerDeviceFFHandle{
-      DeviceSpecific<std::optional<ManagedPerDeviceFFHandle *>>::create(
-          device_id, managed_handle)};
+  return DeviceSpecificManagedPerDeviceFFHandle{device_id, managed_handle};
 }
 
+// Builds a device_handle_t from the handle stored for device_idx.
 device_handle_t device_handle_t_from_device_specific_managed_handle(
     DeviceSpecificManagedPerDeviceFFHandle const &device_specific,
     device_id_t device_idx) {
+  // NOTE(review): get() returns the optional by value, so the `*` below
+  // unwraps it. That is undefined behaviour if the stored handle is
+  // std::nullopt -- TODO confirm what device_handle_t_from_managed_handle_ptr
+  // expects and whether an empty handle can ever reach this point.
   return device_handle_t_from_managed_handle_ptr(
-      *device_specific.handle.get(device_idx));
+      *device_specific.get(device_idx));
 }
 
 } // namespace FlexFlow

From cc64592fd3c786df836871baa3242772119c01c7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 13 Feb 2026 10:57:11 -0800
Subject: [PATCH 051/113] Separate device handle test.

---
 .../distributed_device_handle.cc              | 38 +++++++++++++++++++
 .../test/src/realm-execution/realm_manager.cc | 16 ++++----
 .../test/src/realm-execution/test_e2e.cc      |  5 +++
 3 files changed, 51 insertions(+), 8 deletions(-)
 create mode 100644 lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc

diff --git a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
new file mode 100644
index 0000000000..5a5402a140
--- /dev/null
+++ b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
@@ -0,0 +1,38 @@
+#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/realm_manager.h"
+#include <doctest/doctest.h>
+
+namespace test {
+
+using namespace ::FlexFlow;
+namespace Realm = ::FlexFlow::Realm;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("DistributedDeviceHandle") {
+    // Construct some fake command line for our test
+    char fake_executable_name[] = "fake_executable_name";
+    char arg0[] = "-ll:cpu";
+    char arg1[] = "2";
+    std::vector<char *> fake_args{fake_executable_name, arg0, arg1};
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+
+    RealmManager manager(&fake_argc, &fake_argv);
+
+    (void)manager.start_controller([](RealmContext &ctx) {
+      DistributedDeviceHandle handle = create_distributed_device_handle(
+          /*ctx=*/ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      // Make sure we have handles for the processors we're expecting
+      Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+      pq.only_kind(Realm::Processor::LOC_PROC);
+      for (Realm::Processor proc : pq) {
+        handle.at(proc);
+      }
+    });
+  }
+}
+
+} // namespace test
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 41fa63f4f9..5fe659cdc2 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -2,7 +2,10 @@
 #include "realm-execution/distributed_device_handle.h"
 #include <doctest/doctest.h>
 
+namespace test {
+
 using namespace ::FlexFlow;
+namespace Realm = ::FlexFlow::Realm;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("RealmManager") {
@@ -17,18 +20,15 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     // Launch a controller
     int some_data = 123;
-    FlexFlow::Realm::Event event =
+    Realm::Event event =
         manager.start_controller([&](RealmContext &ctx) {
           // Data is captured and retains value
           ASSERT(some_data == 123);
-
-          // Launch some basic task to ensure everything works
-          DistributedDeviceHandle handle = create_distributed_device_handle(
-              /*ctx=*/ctx,
-              /*workSpaceSize=*/1024 * 1024,
-              /*allowTensorOpMathConversion=*/true);
         });
-    // Need to block on the completion of the event to ensure we don't race
+    // Need to block on the completion of the event to ensure we don't race,
+    // because the lambda captures the environment
     event.wait();
   }
 }
+
+} // namespace test
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 37f1a9b42c..9592cb221c 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -2,7 +2,10 @@
 #include "realm-execution/realm_manager.h"
 #include <doctest/doctest.h>
 
+namespace test {
+
 using namespace ::FlexFlow;
+namespace Realm = ::FlexFlow::Realm;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("RealmBackend e2e Training") {
@@ -14,3 +17,5 @@ TEST_SUITE(FF_TEST_SUITE) {
     (void)manager.start_controller([](RealmContext &ctx) {});
   }
 }
+
+} // namespace test

From c462ff3bc4dee49e174614456f0c8c0794035b98 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 13 Feb 2026 12:26:57 -0800
Subject: [PATCH 052/113] More work on Realm tests.

---
 .../parallel_computation_graph.h              |   4 +
 .../parallel_computation_graph.cc             |  21 +++
 .../pcg_instance/pcg_instance.h               |  32 +++-
 .../pcg_instance/pcg_instance.cc              |  10 +-
 .../test/src/internal/realm_test_utils.cc     |  28 +++
 .../test/src/internal/realm_test_utils.h      |  15 ++
 .../distributed_device_handle.cc              |   8 +-
 .../test/src/realm-execution/realm_manager.cc |  15 +-
 .../test/src/realm-execution/test_e2e.cc      | 173 +++++++++++++++++-
 9 files changed, 283 insertions(+), 23 deletions(-)
 create mode 100644 lib/realm-execution/test/src/internal/realm_test_utils.cc
 create mode 100644 lib/realm-execution/test/src/internal/realm_test_utils.h

diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 3d948ac107..21f33f6d3d 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -32,6 +32,10 @@ ParallelLayerAddedResult add_parallel_layer(
 ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
                                              TensorShape const &tensor_shape);
 
+ParallelLayerAddedResult
+    pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg,
+                                  TensorShape const &tensor_shape);
+
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer);
 
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index 907dc05620..959747dbc7 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -142,6 +142,27 @@ ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
                             });
 }
 
+ParallelLayerAddedResult
+    pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg,
+                                  TensorShape const &tensor_shape) {
+  ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{
+      /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}},
+      /*name=*/std::nullopt,
+  };
+
+  return add_parallel_layer(/*pcg=*/pcg,
+                            /*layer_attrs=*/layer_attrs,
+                            /*inputs=*/{},
+                            /*weights=*/{},
+                            /*output_flags=*/
+                            std::unordered_map<TensorSlotName, CreateGrad>{
+                                {
+                                    TensorSlotName::OUTPUT,
+                                    CreateGrad::YES, // OUTPUT gets a gradient
+                                },
+                            });
+}
+
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer) {
   PCGOperatorAttrs op_attrs = pcg_get_op_attrs(pcg, layer);
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index b917477df4..b0037f51b2 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_PCG_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_PCG_INSTANCE_H
 
 #include "kernels/accessor.h"
 #include "kernels/allocation.h"
@@ -57,6 +57,34 @@ PCGInstance create_pcg_instance(
     DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config);
 
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_all_passes_for_pcg_instance(
+        PCGInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_forward_pass_for_pcg_instance(
+        PCGInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_backward_pass_for_pcg_instance(
+        PCGInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_update_pass_for_pcg_instance(
+        PCGInstance &instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index c79d8e8abd..de7cdcb687 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -47,7 +47,7 @@ std::optional<Realm::RegionInstance>
   return this->logit_grad_tensor;
 }
 
-PCGInstance create_parallel_computation_graph_instance(
+PCGInstance create_pcg_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
@@ -160,7 +160,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 }
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
-    perform_all_passes_for_parallel_computation_graph_instance(
+    perform_all_passes_for_pcg_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
@@ -180,7 +180,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 }
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
-    perform_forward_pass_for_parallel_computation_graph_instance(
+    perform_forward_pass_for_pcg_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
@@ -203,7 +203,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 }
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
-    perform_backward_pass_for_parallel_computation_graph_instance(
+    perform_backward_pass_for_pcg_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
@@ -226,7 +226,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 }
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
-    perform_update_pass_for_parallel_computation_graph_instance(
+    perform_update_pass_for_pcg_instance(
         PCGInstance &instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
diff --git a/lib/realm-execution/test/src/internal/realm_test_utils.cc b/lib/realm-execution/test/src/internal/realm_test_utils.cc
new file mode 100644
index 0000000000..e381feb8de
--- /dev/null
+++ b/lib/realm-execution/test/src/internal/realm_test_utils.cc
@@ -0,0 +1,28 @@
+#include "internal/realm_test_utils.h"
+#include <fmt/format.h>
+#include <string>
+
+namespace FlexFlow {
+
+static char *leak_string_contents(std::string const &str) {
+  // Copies str into a heap-allocated buffer that is never freed: Realm
+  // command-line arguments require char*, so the leak is intentional
+  std::vector<char> *content = new std::vector<char>{str.begin(), str.end()};
+  content->push_back(0); // NUL terminator so the buffer is a valid C string
+  return content->data();
+}
+
+std::vector<char *> make_fake_realm_args(positive_int num_cpus,
+                                         nonnegative_int num_gpus) {
+  std::vector<char *> result;
+  result.push_back(leak_string_contents("fake_executable_name")); // argv[0]
+  result.push_back(leak_string_contents("-ll:cpu"));
+  result.push_back(leak_string_contents(fmt::to_string(num_cpus)));
+  if (num_gpus > 0) { // the GPU flag is omitted entirely when num_gpus is 0
+    result.push_back(leak_string_contents("-ll:gpu"));
+    result.push_back(leak_string_contents(fmt::to_string(num_gpus)));
+  }
+  return result; // elements point at intentionally leaked buffers
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/internal/realm_test_utils.h b/lib/realm-execution/test/src/internal/realm_test_utils.h
new file mode 100644
index 0000000000..8e2775ad8b
--- /dev/null
+++ b/lib/realm-execution/test/src/internal/realm_test_utils.h
@@ -0,0 +1,15 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_TEST_SRC_INTERNAL_REALM_TEST_UTILS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_TEST_SRC_INTERNAL_REALM_TEST_UTILS_H
+
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/positive_int/positive_int.h"
+#include <vector>
+
+namespace FlexFlow {
+
+std::vector<char *> make_fake_realm_args(positive_int num_cpus,
+                                         nonnegative_int num_gpus);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
index 5a5402a140..fb7dff01e3 100644
--- a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/distributed_device_handle.h"
+#include "internal/realm_test_utils.h"
 #include "realm-execution/realm_manager.h"
 #include <doctest/doctest.h>
 
@@ -9,11 +10,8 @@ namespace Realm = ::FlexFlow::Realm;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("DistributedDeviceHandle") {
-    // Construct some fake command line for our test
-    char fake_executable_name[] = "fake_executable_name";
-    char arg0[] = "-ll:cpu";
-    char arg1[] = "2";
-    std::vector<char *> fake_args{fake_executable_name, arg0, arg1};
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n);
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
 
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 5fe659cdc2..450d7fd3ec 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/realm_manager.h"
+#include "internal/realm_test_utils.h"
 #include "realm-execution/distributed_device_handle.h"
 #include <doctest/doctest.h>
 
@@ -9,9 +10,8 @@ namespace Realm = ::FlexFlow::Realm;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("RealmManager") {
-    // Construct some fake command line for our test
-    char fake_executable_name[] = "fake_executable_name";
-    std::vector<char *> fake_args{fake_executable_name};
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/0_n);
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
 
@@ -20,11 +20,10 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     // Launch a controller
     int some_data = 123;
-    Realm::Event event =
-        manager.start_controller([&](RealmContext &ctx) {
-          // Data is captured and retains value
-          ASSERT(some_data == 123);
-        });
+    Realm::Event event = manager.start_controller([&](RealmContext &ctx) {
+      // Data is captured and retains value
+      ASSERT(some_data == 123);
+    });
     // Need to block on the completion of the event to ensure we don't race,
     // because the lambda captures the environment
     event.wait();
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 9592cb221c..33ad2bbbc1 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -1,5 +1,12 @@
+#include "internal/realm_test_utils.h"
+#include "kernels/allocation.h"
+#include "op-attrs/tensor_shape.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
+#include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/pcg_instance/pcg_instance.h"
 #include "realm-execution/realm_manager.h"
+#include "utils/containers/require_only_key.h"
 #include <doctest/doctest.h>
 
 namespace test {
@@ -9,12 +16,172 @@ namespace Realm = ::FlexFlow::Realm;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("RealmBackend e2e Training") {
-    char fake_executable_name[] = "fake_executable_name";
-    std::vector<char *> fake_args{fake_executable_name};
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/0_n);
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
+
     RealmManager manager(&fake_argc, &fake_argv);
-    (void)manager.start_controller([](RealmContext &ctx) {});
+
+    (void)manager.start_controller([](RealmContext &ctx) {
+      Allocator allocator = ctx.get_current_device_allocator();
+
+      positive_int batch_size = 10_p;
+      positive_int data_dim = 16_p;
+      positive_int hidden_dim = 32_p;
+      positive_int output_dim = 1_p;
+
+      TensorShape output_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+
+      GenericTensorAccessorW label_tensor_backing = // NOTE(review): unused
+          allocator.allocate_tensor(output_tensor_shape);
+
+      // construct the PCG: input -> linear(hidden_dim) -> linear(output_dim)
+      ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+      TensorShape input_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+      TensorShape label_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
+      TensorShape weight_shape_1 = TensorShape{
+          TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
+      TensorShape weight_shape_2 = TensorShape{
+          TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
+
+      ParallelLayerAddedResult inputs_layer =
+          pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+      parallel_tensor_guid_t t_input =
+          require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_1 =
+          require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_2 =
+          require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_input,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_1,
+              },
+          });
+      parallel_tensor_guid_t t_linear_1 =
+          require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_linear_1,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_2,
+              },
+          });
+      parallel_tensor_guid_t t_linear_2 =
+          require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
+
+      MappedParallelComputationGraph mpcg{pcg, {}};
+
+      // instantiate computation graph
+      LossAttrs loss_attrs = LossAttrs{
+          NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+      OptimizerAttrs optimizer_attrs =
+          OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                           /*momentum=*/0.9,
+                                           /*nesterov=*/false,
+                                           /*weight_decay=*/0.001}};
+
+      std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
+          input_tensors;
+
+      DistributedDeviceHandle device_handle = create_distributed_device_handle(
+          ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      PCGInstance pcg_instance = create_pcg_instance(
+          /*ctx=*/ctx,
+          /*mpcg=*/mpcg,
+          /*optimizer=*/optimizer_attrs,
+          /*loss=*/loss_attrs,
+          /*label_tensor=*/label_tensor,
+          /*logit_tensor=*/t_linear_2,
+          /*input_tensors=*/input_tensors,
+          /*profiling_settings=*/ProfilingSettings{0, 0},
+          /*device_handle=*/device_handle,
+          /*iteration_config=*/FFIterationConfig{1_p});
+
+      // training loop: run all passes once per epoch
+      int num_epochs = 5;
+      std::vector<GenericTensorAccessorR> loss_values; // currently unused
+
+      for (int i = 0; i < num_epochs; i++) {
+        perform_all_passes_for_pcg_instance(
+            /*instance=*/pcg_instance,
+            /*profiling_settings=*/ProfilingSettings{0, 0},
+            /*device_handle=*/device_handle,
+            /*iteration_config=*/FFIterationConfig{1_p});
+        // loss_values.push_back(copy_tensor_accessor_r(
+        //     pcg_instance.get_loss_tensor_accessor().value(),
+        //     allocator));
+      }
+
+      // // Assert that each sample in the batch has a lower loss in last epoch
+      // // than the first epoch
+      // GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+      // GenericTensorAccessorR last_epoch_loss = loss_values.back();
+      // CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
+      //               check_kv("first_epoch_loss",
+      //                        format_accessor_r_contents(first_epoch_loss)),
+      //               check_kv("last_epoch_loss",
+      //                        format_accessor_r_contents(last_epoch_loss)));
+    });
   }
 }
 

From 0ba568099de39a88a7769fa48d5fbe4f7c1acec7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 13 Feb 2026 16:55:22 -0800
Subject: [PATCH 053/113] JSON serialization of a bunch of data types.

---
 lib/pcg/include/pcg/layer_guid_t.dtg.toml     |  1 +
 .../mapped_operator_task_group.h              | 12 ++++++
 .../parallel_layer_guid_t.dtg.toml            |  1 +
 .../mapped_operator_task_group.cc             | 17 ++++++++
 .../mapped_operator_task_group.cc             | 42 ++++++++++++++++++
 .../dynamic_layer_guid_t.dtg.toml             |  1 +
 .../serializable_dynamic_node_attrs.dtg.toml  | 43 +++++++++++++++++++
 ...ializable_dynamic_node_invocation.dtg.toml | 33 ++++++++++++++
 .../serializable_dynamic_value_attrs.dtg.toml | 34 +++++++++++++++
 .../training_operation_attrs.dtg.toml         |  1 +
 10 files changed, 185 insertions(+)
 create mode 100644 lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml

diff --git a/lib/pcg/include/pcg/layer_guid_t.dtg.toml b/lib/pcg/include/pcg/layer_guid_t.dtg.toml
index d73cf547da..2f2f7694a0 100644
--- a/lib/pcg/include/pcg/layer_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/layer_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h
index 5b1cad5e99..ebfdefa478 100644
--- a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h
+++ b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h
@@ -5,6 +5,7 @@
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h"
 #include "utils/bidict/bidict.h"
+#include <nlohmann/json.hpp>
 
 namespace FlexFlow {
 
@@ -45,4 +46,15 @@ struct hash<::FlexFlow::MappedOperatorTaskGroup> {
 };
 
 } // namespace std
+
+namespace nlohmann {
+
+template <>
+struct adl_serializer<::FlexFlow::MappedOperatorTaskGroup> {
+  static ::FlexFlow::MappedOperatorTaskGroup from_json(json const &j);
+  static void to_json(json &j, ::FlexFlow::MappedOperatorTaskGroup const &t);
+};
+
+} // namespace nlohmann
+
 #endif
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml b/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml
index 618bcb0dc4..292b361fc8 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
index b96a447383..4436efd727 100644
--- a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
+++ b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
@@ -90,3 +90,20 @@ size_t hash<::FlexFlow::MappedOperatorTaskGroup>::operator()(
 }
 
 } // namespace std
+
+namespace nlohmann {
+
+::FlexFlow::MappedOperatorTaskGroup
+    adl_serializer<::FlexFlow::MappedOperatorTaskGroup>::from_json(
+        json const &j) { // j is parsed as the shard-binding bidict
+  return ::FlexFlow::MappedOperatorTaskGroup{j.template get<
+      ::FlexFlow::bidict<::FlexFlow::MachineSpaceCoordinate,
+                         ::FlexFlow::OperatorAtomicTaskShardBinding>>()};
+}
+
+void adl_serializer<::FlexFlow::MappedOperatorTaskGroup>::to_json(
+    json &j, ::FlexFlow::MappedOperatorTaskGroup const &t) {
+  j = t.get_shard_bindings(); // serialized as the bare shard-binding bidict
+}
+
+} // namespace nlohmann
diff --git a/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc b/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
new file mode 100644
index 0000000000..1c3667afc7
--- /dev/null
+++ b/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
@@ -0,0 +1,42 @@
+#include "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h"
+#include "op-attrs/parallel_tensor_space_coordinate.dtg.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
+#include "pcg/device_type.dtg.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h"
+#include <doctest/doctest.h>
+#include <nlohmann/json.hpp>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("adl_serializer<MappedOperatorTaskGroup>") {
+    bidict<MachineSpaceCoordinate, OperatorAtomicTaskShardBinding>
+        shard_bindings{
+            {MachineSpaceCoordinate{0_n, 0_n, DeviceType::CPU},
+             OperatorAtomicTaskShardBinding{
+                 {
+                     {TensorSlotName::INPUT,
+                      ParallelTensorSpaceCoordinate{
+                          0_n, 0_n, FFOrdered{1_n, 2_n, 3_n}}},
+                 },
+             }},
+        };
+    MappedOperatorTaskGroup deserialized{shard_bindings};
+    nlohmann::json serialized = shard_bindings; // JSON of the bare bidict
+
+    SUBCASE("to_json") {
+      nlohmann::json result = deserialized; // implicitly calls to_json
+      nlohmann::json correct = serialized;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("from_json") {
+      MappedOperatorTaskGroup result = serialized; // implicitly calls from_json
+      MappedOperatorTaskGroup correct = deserialized;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml
index c6e6673f33..bd64f52567 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml
@@ -5,6 +5,7 @@ features = [
   "eq",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml
new file mode 100644
index 0000000000..3c43e1d637
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml
@@ -0,0 +1,43 @@
+namespace = "FlexFlow"
+name = "SerializableDynamicNodeAttrs"
+type = "struct"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "json",
+]
+
+includes = [
+  "<optional>",
+  "task-spec/dynamic_graph/dynamic_task_type.dtg.h",
+  "pcg/machine_space_coordinate.dtg.h",
+  "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h",
+  "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h",
+  "task-spec/dynamic_graph/training_operation_attrs.dtg.h",
+]
+
+src_includes = [
+  "utils/fmt/optional.h",
+  "utils/json/optional.h",
+]
+
+[[fields]]
+name = "task_type"
+type = "std::optional<::FlexFlow::DynamicTaskType>"
+
+[[fields]]
+name = "device_coord"
+type = "std::optional<::FlexFlow::MachineSpaceCoordinate>"
+
+[[fields]]
+name = "mapping"
+type = "std::optional<::FlexFlow::MappedOperatorTaskGroup>"
+
+[[fields]]
+name = "op_attrs"
+type = "std::optional<::FlexFlow::TrainingOperationAttrs>"
+
+[[fields]]
+name = "layer_guid"
+type = "::FlexFlow::dynamic_layer_guid_t"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml
new file mode 100644
index 0000000000..01f4cc8876
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml
@@ -0,0 +1,33 @@
+namespace = "FlexFlow"
+name = "SerializableDynamicNodeInvocation"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "<unordered_map>",
+  "task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.h",
+  "task-spec/dynamic_graph/dynamic_tensor_slot.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h",
+]
+
+src_includes = [
+  "utils/hash/unordered_map.h",
+  "utils/fmt/unordered_map.h",
+]
+
+[[fields]]
+name = "inputs"
+type = "std::unordered_map<::FlexFlow::DynamicTensorSlot, ::FlexFlow::SerializableDynamicValueAttrs>"
+
+[[fields]]
+name = "node_attrs"
+type = "::FlexFlow::SerializableDynamicNodeAttrs"
+
+[[fields]]
+name = "outputs"
+type = "std::unordered_map<::FlexFlow::DynamicTensorSlot, ::FlexFlow::SerializableDynamicValueAttrs>"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
new file mode 100644
index 0000000000..05864b4b47
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
@@ -0,0 +1,34 @@
+namespace = "FlexFlow"
+name = "SerializableDynamicValueAttrs"
+type = "struct"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "json",
+]
+
+includes = [
+  "<optional>",
+  "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h",
+  "op-attrs/parallel_tensor_shape.dtg.h",
+  "op-attrs/parallel_tensor_space_coordinate.dtg.h",
+  "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h",
+]
+
+src_includes = [
+  "utils/fmt/optional.h",
+  "utils/json/optional.h",
+]
+
+[[fields]]
+name = "parallel_tensor_shape"
+type = "std::optional<::FlexFlow::ParallelTensorShape>"
+
+[[fields]]
+name = "shard_coord"
+type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>"
+
+[[fields]]
+name = "role"
+type = "std::optional<::FlexFlow::DynamicTensorRole>"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml
index 66c475b3a9..1051d8ac13 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml
@@ -5,6 +5,7 @@ features = [
   "eq",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [

From bb9d45b4ba3cac793751301038258bdffc55ac91 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 13 Feb 2026 17:20:49 -0800
Subject: [PATCH 054/113] Make guid types and dynamic value attrs serializable.

---
 .../parallel_tensor_guid_t.dtg.toml           |  1 +
 lib/pcg/include/pcg/tensor_guid_t.dtg.toml    |  1 +
 .../dynamic_tensor_guid_t.dtg.toml            |  1 +
 .../serializable_dynamic_value_attrs.dtg.toml |  4 +++
 .../serializable_dynamic_value_attrs.h        | 16 +++++++++++
 .../serializable_dynamic_value_attrs.cc       | 27 +++++++++++++++++++
 .../kwarg_dataflow_output.dtg.toml            |  1 +
 7 files changed, 51 insertions(+)
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h
 create mode 100644 lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc

diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml
index 4494a31ac2..2710a15664 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/include/pcg/tensor_guid_t.dtg.toml b/lib/pcg/include/pcg/tensor_guid_t.dtg.toml
index 151f7b1f0f..e8caf0021f 100644
--- a/lib/pcg/include/pcg/tensor_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/tensor_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml
index 75e9099104..c9171b928b 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml
@@ -5,6 +5,7 @@ features = [
   "eq",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
index 05864b4b47..6209bfa247 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
@@ -21,6 +21,10 @@ src_includes = [
   "utils/json/optional.h",
 ]
 
+[[fields]]
+name = "tensor_guid"
+type = "::FlexFlow::dynamic_tensor_guid_t"
+
 [[fields]]
 name = "parallel_tensor_shape"
 type = "std::optional<::FlexFlow::ParallelTensorShape>"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h
new file mode 100644
index 0000000000..6272265b7e
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_VALUE_ATTRS_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_VALUE_ATTRS_H
+
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h"
+
+namespace FlexFlow {
+
+SerializableDynamicValueAttrs
+    dynamic_value_attrs_to_serializable(DynamicValueAttrs const &);
+DynamicValueAttrs dynamic_value_attrs_from_serializable(
+    SerializableDynamicValueAttrs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc
new file mode 100644
index 0000000000..2dc0b509ab
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc
@@ -0,0 +1,27 @@
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include <optional>
+
+namespace FlexFlow {
+
+SerializableDynamicValueAttrs
+    dynamic_value_attrs_to_serializable(DynamicValueAttrs const &attrs) {
+  return SerializableDynamicValueAttrs{
+      /*tensor_guid=*/attrs.tensor_guid,
+      /*parallel_tensor_shape=*/attrs.parallel_tensor_shape,
+      /*shard_coord=*/attrs.shard_coord,
+      /*role=*/attrs.role,
+  };
+}
+
+DynamicValueAttrs dynamic_value_attrs_from_serializable(
+    SerializableDynamicValueAttrs const &attrs) {
+  return DynamicValueAttrs{
+      /*tensor_guid=*/attrs.tensor_guid,
+      /*parallel_tensor_shape=*/attrs.parallel_tensor_shape,
+      /*shard_coord=*/attrs.shard_coord,
+      /*accessor=*/std::nullopt,
+      /*role=*/attrs.role,
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml b/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml
index f286fb90a7..5b537eac88 100644
--- a/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml
+++ b/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 template_params = [

From 56426ce2c23175fc429d1423268a061d1de73b88 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 13 Feb 2026 22:18:55 -0800
Subject: [PATCH 055/113] Add to-do notes to create_pcg_instance.

---
 .../src/realm-execution/pcg_instance/pcg_instance.cc          | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index de7cdcb687..199f2dc090 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -111,6 +111,10 @@ PCGInstance create_pcg_instance(
   //  * external instances
   //  * task argument serializer
   //  * copies
+  //  * parallel operator implementation (partition, reduce, gather, etc.)
+  //  * and fused parallel operators (reduce + broadcast = allreduce)
+  //  * memory-optimizing compiler integration (tensor creation/destruction,
+  //  tensor reuse)
 }
 
 static std::unordered_map<dynamic_layer_guid_t, Realm::Event>

From c1d828b511cf174add4469b145593d8ca4620cc6 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 12:17:15 -0800
Subject: [PATCH 056/113] Add serialization for dynamic node attrs/invocations.

---
 .../serializable_dynamic_node_attrs.h         | 16 ++++++++++
 .../serializable_dynamic_node_invocation.h    | 16 ++++++++++
 .../serializable_dynamic_node_attrs.cc        | 29 +++++++++++++++++
 .../serializable_dynamic_node_invocation.cc   | 31 +++++++++++++++++++
 4 files changed, 92 insertions(+)
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h
 create mode 100644 lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc
 create mode 100644 lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc

diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h
new file mode 100644
index 0000000000..7a274a1e7b
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_ATTRS_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_ATTRS_H
+
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.h"
+
+namespace FlexFlow {
+
+SerializableDynamicNodeAttrs
+    dynamic_node_attrs_to_serializable(DynamicNodeAttrs const &);
+DynamicNodeAttrs
+    dynamic_node_attrs_from_serializable(SerializableDynamicNodeAttrs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h
new file mode 100644
index 0000000000..2bcdb9a898
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_INVOCATION_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_INVOCATION_H
+
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h"
+
+namespace FlexFlow {
+
+SerializableDynamicNodeInvocation
+    dynamic_node_invocation_to_serializable(DynamicNodeInvocation const &);
+DynamicNodeInvocation dynamic_node_invocation_from_serializable(
+    SerializableDynamicNodeInvocation const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc
new file mode 100644
index 0000000000..d613194d14
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc
@@ -0,0 +1,29 @@
+#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.h"
+#include <optional>
+
+namespace FlexFlow {
+
+SerializableDynamicNodeAttrs
+    dynamic_node_attrs_to_serializable(DynamicNodeAttrs const &attrs) {
+  return SerializableDynamicNodeAttrs{
+      /*task_type=*/attrs.task_type,
+      /*device_coord=*/attrs.device_coord,
+      /*mapping=*/attrs.mapping,
+      /*op_attrs=*/attrs.op_attrs,
+      /*layer_guid=*/attrs.layer_guid,
+  };
+}
+
+DynamicNodeAttrs dynamic_node_attrs_from_serializable(
+    SerializableDynamicNodeAttrs const &attrs) {
+  return DynamicNodeAttrs{
+      /*task_type=*/attrs.task_type,
+      /*device_coord=*/attrs.device_coord,
+      /*mapping=*/attrs.mapping,
+      /*op_attrs=*/attrs.op_attrs,
+      /*layer_guid=*/attrs.layer_guid,
+      /*per_device_op_state=*/std::nullopt,
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc
new file mode 100644
index 0000000000..334623ee67
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc
@@ -0,0 +1,31 @@
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include "utils/containers/map_values.h"
+
+namespace FlexFlow {
+
+SerializableDynamicNodeInvocation dynamic_node_invocation_to_serializable(
+    DynamicNodeInvocation const &invocation) {
+  return SerializableDynamicNodeInvocation{
+      /*inputs=*/map_values(invocation.inputs,
+                            dynamic_value_attrs_to_serializable),
+      /*node_attrs=*/dynamic_node_attrs_to_serializable(invocation.node_attrs),
+      /*outputs=*/
+      map_values(invocation.outputs, dynamic_value_attrs_to_serializable),
+  };
+}
+
+DynamicNodeInvocation dynamic_node_invocation_from_serializable(
+    SerializableDynamicNodeInvocation const &invocation) {
+  return DynamicNodeInvocation{
+      /*inputs=*/map_values(invocation.inputs,
+                            dynamic_value_attrs_from_serializable),
+      /*node_attrs=*/
+      dynamic_node_attrs_from_serializable(invocation.node_attrs),
+      /*outputs=*/
+      map_values(invocation.outputs, dynamic_value_attrs_from_serializable),
+  };
+}
+
+} // namespace FlexFlow

From 30bddfdfa8fbc6da55975dd4546d752918b50534 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 12:18:49 -0800
Subject: [PATCH 057/113] Implement most of the task argument serializer.

---
 .../serializable_realm_processor.dtg.toml     | 17 ++++++
 .../serializer/serializable_realm_processor.h | 16 +++++
 .../tasks/serializer/task_arg_serializer.h    | 26 ++++++++
 .../tasks/impl/device_state_init_task.cc      | 61 +++++++++++++------
 .../serializable_realm_processor.cc           | 15 +++++
 5 files changed, 115 insertions(+), 20 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc

diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml
new file mode 100644
index 0000000000..3cb64d95c1
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SerializableRealmProcessor"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "realm-execution/realm.h",
+]
+
+[[fields]]
+name = "id"
+type = "::FlexFlow::Realm::Processor::id_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h
new file mode 100644
index 0000000000..6b29b6e223
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_PROCESSOR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_PROCESSOR_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h"
+
+namespace FlexFlow {
+
+SerializableRealmProcessor
+    realm_processor_to_serializable(Realm::Processor const &);
+Realm::Processor
+    realm_processor_from_serializable(SerializableRealmProcessor const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
new file mode 100644
index 0000000000..fc5abba587
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
@@ -0,0 +1,26 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_TASK_ARG_SERIALIZER_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_TASK_ARG_SERIALIZER_H
+
+#include <nlohmann/json.hpp>
+#include <string>
+#include <string_view>
+
+namespace FlexFlow {
+
+template <typename T>
+std::string serialize_task_args(T const &args) {
+  nlohmann::json j;
+  args.serialize(j);
+  return j.dump();
+}
+
+template <typename T>
+T deserialize_task_args(void const *args, size_t arglen) {
+  nlohmann::json j = nlohmann::json::parse(
+      std::string_view{reinterpret_cast<char const *>(args), arglen});
+  return T::deserialize(j);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index 5a51b1c803..0e7730e485 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -3,11 +3,16 @@
 #include "local-execution/device_state_initialization.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
 #include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
+#include "utils/exception.h"
 #include "utils/optional.h"
+#include <cstdint>
 #include <optional>
 #include <type_traits>
 
@@ -19,11 +24,11 @@ namespace FlexFlow {
 struct DeviceStateInitTaskArgs {
   DeviceStateInitTaskArgs() = delete;
   DeviceStateInitTaskArgs(
-      DynamicNodeInvocation const *invocation,
-      ProfilingSettings const *profiling_settings,
+      DynamicNodeInvocation const &invocation,
+      ProfilingSettings const &profiling_settings,
       DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-      FFIterationConfig const *iteration_config,
-      OptimizerAttrs const *optimizer_attrs,
+      FFIterationConfig const &iteration_config,
+      OptimizerAttrs const &optimizer_attrs,
       Realm::Processor origin_proc,
       DeviceSpecificPerDeviceOpState *origin_result_ptr)
       : invocation(invocation), profiling_settings(profiling_settings),
@@ -31,12 +36,28 @@ struct DeviceStateInitTaskArgs {
         optimizer_attrs(optimizer_attrs), origin_proc(origin_proc),
         origin_result_ptr(origin_result_ptr) {}
 
+  void serialize(nlohmann::json &j) const {
+    j = {
+        {"invocation", dynamic_node_invocation_to_serializable(invocation)},
+        {"profiling_settings", profiling_settings},
+        // {"device_handle", device_handle},
+        {"iteration_config", iteration_config},
+        {"optimizer_attrs", optimizer_attrs},
+        {"origin_proc", realm_processor_to_serializable(origin_proc)},
+        {"origin_result_ptr", reinterpret_cast<uintptr_t>(origin_result_ptr)},
+    };
+  }
+
+  static DeviceStateInitTaskArgs deserialize(nlohmann::json const &j) {
+    NOT_IMPLEMENTED();
+  }
+
 public:
-  DynamicNodeInvocation const *invocation;
-  ProfilingSettings const *profiling_settings;
+  DynamicNodeInvocation invocation;
+  ProfilingSettings profiling_settings;
   DeviceSpecificManagedPerDeviceFFHandle device_handle;
-  FFIterationConfig const *iteration_config;
-  OptimizerAttrs const *optimizer_attrs;
+  FFIterationConfig iteration_config;
+  OptimizerAttrs optimizer_attrs;
   Realm::Processor origin_proc;
   DeviceSpecificPerDeviceOpState *origin_result_ptr;
 };
@@ -46,9 +67,8 @@ void device_state_init_task_body(void const *args,
                                  void const *userdata,
                                  size_t userlen,
                                  Realm::Processor proc) {
-  ASSERT(arglen == sizeof(DeviceStateInitTaskArgs));
   DeviceStateInitTaskArgs task_args =
-      *reinterpret_cast<DeviceStateInitTaskArgs const *>(args);
+      deserialize_task_args<DeviceStateInitTaskArgs>(args, arglen);
 
   // FIXME: serialize instead of passing pointers around
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
@@ -58,12 +78,12 @@ void device_state_init_task_body(void const *args,
       device_handle_t_from_device_specific_managed_handle(
           task_args.device_handle, ctx.get_current_device_idx());
   DynamicNodeInvocation result_invocation =
-      initialize_node(*task_args.invocation,
+      initialize_node(task_args.invocation,
                       ctx.get_current_device_allocator(),
-                      *task_args.profiling_settings,
+                      task_args.profiling_settings,
                       device_handle,
-                      *task_args.iteration_config,
-                      *task_args.optimizer_attrs,
+                      task_args.iteration_config,
+                      task_args.optimizer_attrs,
                       ctx.get_current_device_idx());
   DeviceSpecificPerDeviceOpState result_state =
       assert_unwrap(result_invocation.node_attrs.per_device_op_state);
@@ -89,11 +109,11 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     DeviceSpecificPerDeviceOpState *result_ptr,
     Realm::Event precondition) {
   DeviceStateInitTaskArgs task_args{
-      &invocation,
-      &profiling_settings,
+      invocation,
+      profiling_settings,
       device_handle,
-      &iteration_config,
-      &optimizer_attrs,
+      iteration_config,
+      optimizer_attrs,
       ctx.get_current_processor(),
       result_ptr,
   };
@@ -105,10 +125,11 @@ std::optional<Realm::Event> spawn_device_state_init_task(
                         }),
                get_init_task_id_for_op_attrs);
   if (task_id.has_value()) {
+    std::string args = serialize_task_args(task_args);
     return ctx.spawn_task(target_proc,
                           assert_unwrap(task_id),
-                          &task_args,
-                          sizeof(task_args),
+                          args.data(),
+                          args.size(),
                           Realm::ProfilingRequestSet{},
                           precondition);
   }
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc
new file mode 100644
index 0000000000..b16e2891c4
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc
@@ -0,0 +1,15 @@
+#include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+
+namespace FlexFlow {
+
+SerializableRealmProcessor
+    realm_processor_to_serializable(Realm::Processor const &proc) {
+  return SerializableRealmProcessor{proc.id};
+}
+
+Realm::Processor
+    realm_processor_from_serializable(SerializableRealmProcessor const &proc) {
+  return Realm::Processor{proc.id};
+}
+
+} // namespace FlexFlow

From e258cb4754e6d97d7bd565ef3db1b46c42bdbfc8 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 12:41:51 -0800
Subject: [PATCH 058/113] Finish serialization of device init task.

---
 ...ce_specific_managed_per_device_ff_handle.h |  6 ++++
 ...e_specific_managed_per_device_ff_handle.cc | 28 +++++++++++++++++++
 .../tasks/impl/device_state_init_task.cc      | 24 ++++++++++++++--
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
index 19a70491a2..45617ffcbf 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -4,6 +4,8 @@
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "pcg/device_id_t.dtg.h"
+#include <nlohmann/json.hpp>
+#include <optional>
 
 namespace FlexFlow {
 
@@ -15,6 +17,10 @@ struct DeviceSpecificManagedPerDeviceFFHandle {
 
   std::optional<ManagedPerDeviceFFHandle *> get(device_id_t device_idx) const;
 
+  void serialize(nlohmann::json &j) const;
+  static DeviceSpecificManagedPerDeviceFFHandle
+      deserialize(nlohmann::json const &j);
+
 private:
   device_id_t owner;
   std::optional<ManagedPerDeviceFFHandle *> handle;
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index 99ff7a6dd6..ea0782fd4b 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -1,5 +1,8 @@
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "kernels/device_handle_t.h"
+#include "utils/containers/transform.h"
+#include "utils/json/optional.h"
+#include <cstdint>
 
 namespace FlexFlow {
 
@@ -13,6 +16,31 @@ std::optional<ManagedPerDeviceFFHandle *>
   return this->handle;
 }
 
+void DeviceSpecificManagedPerDeviceFFHandle::serialize(
+    nlohmann::json &j) const {
+  j = {
+      {"owner", owner},
+      {"handle",
+       transform(handle,
+                 [](ManagedPerDeviceFFHandle *ptr) {
+                   return reinterpret_cast<uintptr_t>(ptr);
+                 })},
+  };
+}
+
+DeviceSpecificManagedPerDeviceFFHandle
+    DeviceSpecificManagedPerDeviceFFHandle::deserialize(
+        nlohmann::json const &j) {
+  return DeviceSpecificManagedPerDeviceFFHandle{
+      /*owner=*/j.at("owner").get<device_id_t>(),
+      /*handle=*/
+      transform(j.at("handle").get<std::optional<uintptr_t>>(),
+                [](uintptr_t ptrval) {
+                  return reinterpret_cast<ManagedPerDeviceFFHandle *>(ptrval);
+                }),
+  };
+}
+
 DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
     device_id_t const &device_id,
     std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index 0e7730e485..312c3f2401 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -3,6 +3,7 @@
 #include "local-execution/device_state_initialization.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h"
 #include "realm-execution/tasks/serializer/serializable_realm_processor.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
@@ -10,7 +11,6 @@
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
 #include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
-#include "utils/exception.h"
 #include "utils/optional.h"
 #include <cstdint>
 #include <optional>
@@ -37,10 +37,12 @@ struct DeviceStateInitTaskArgs {
         origin_result_ptr(origin_result_ptr) {}
 
   void serialize(nlohmann::json &j) const {
+    nlohmann::json j_device_handle;
+    device_handle.serialize(j_device_handle);
     j = {
         {"invocation", dynamic_node_invocation_to_serializable(invocation)},
         {"profiling_settings", profiling_settings},
-        // {"device_handle", device_handle},
+        {"device_handle", j_device_handle},
         {"iteration_config", iteration_config},
         {"optimizer_attrs", optimizer_attrs},
         {"origin_proc", realm_processor_to_serializable(origin_proc)},
@@ -49,7 +51,23 @@ struct DeviceStateInitTaskArgs {
   }
 
   static DeviceStateInitTaskArgs deserialize(nlohmann::json const &j) {
-    NOT_IMPLEMENTED();
+    return DeviceStateInitTaskArgs{
+        /*invocation=*/dynamic_node_invocation_from_serializable(
+            j.at("invocation").get<SerializableDynamicNodeInvocation>()),
+        /*profiling_settings=*/
+        j.at("profiling_settings").get<ProfilingSettings>(),
+        /*device_handle=*/
+        DeviceSpecificManagedPerDeviceFFHandle::deserialize(
+            j.at("device_handle")),
+        /*iteration_config=*/j.at("iteration_config").get<FFIterationConfig>(),
+        /*optimizer_attrs=*/j.at("optimizer_attrs").get<OptimizerAttrs>(),
+        /*origin_proc=*/
+        realm_processor_from_serializable(
+            j.at("origin_proc").get<SerializableRealmProcessor>()),
+        /*origin_result_ptr=*/
+        reinterpret_cast<DeviceSpecificPerDeviceOpState *>(
+            j.at("origin_result_ptr").get<uintptr_t>()),
+    };
   }
 
 public:

From 0d9631858c201df88b1c4516117433d596631e24 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 14:55:30 -0800
Subject: [PATCH 059/113] Switch over to explicit DTGs for task arguments and
 serialization.

---
 ...ce_specific_managed_per_device_ff_handle.h |  5 +-
 .../device_handle_init_task_args.dtg.toml     | 26 ++++++
 .../impl/device_state_init_task_args.dtg.toml | 42 ++++++++++
 ...able_device_handle_init_task_args.dtg.toml | 30 +++++++
 ...erializable_device_handle_init_task_args.h | 17 ++++
 ...zable_device_state_init_task_args.dtg.toml | 48 +++++++++++
 ...serializable_device_state_init_task_args.h | 16 ++++
 .../serializable_device_specific_ptr.dtg.toml | 28 +++++++
 .../tasks/serializer/task_arg_serializer.h    |  5 +-
 ...e_specific_managed_per_device_ff_handle.cc | 24 +++---
 .../pcg_instance/pcg_instance.cc              |  1 +
 .../tasks/impl/device_handle_init_task.cc     | 35 ++------
 .../tasks/impl/device_state_init_task.cc      | 82 ++-----------------
 ...rializable_device_handle_init_task_args.cc | 28 +++++++
 ...erializable_device_state_init_task_args.cc | 36 ++++++++
 15 files changed, 304 insertions(+), 119 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc

diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
index 45617ffcbf..d48a80f438 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -4,6 +4,7 @@
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "pcg/device_id_t.dtg.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h"
 #include <nlohmann/json.hpp>
 #include <optional>
 
@@ -17,9 +18,9 @@ struct DeviceSpecificManagedPerDeviceFFHandle {
 
   std::optional<ManagedPerDeviceFFHandle *> get(device_id_t device_idx) const;
 
-  void serialize(nlohmann::json &j) const;
+  SerializableDeviceSpecificPtr serialize() const;
   static DeviceSpecificManagedPerDeviceFFHandle
-      deserialize(nlohmann::json const &j);
+      deserialize(SerializableDeviceSpecificPtr const &j);
 
 private:
   device_id_t owner;
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
new file mode 100644
index 0000000000..c0ba37bb5d
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "DeviceHandleInitTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/realm.h",
+  "realm-execution/tasks/serializer/serializable_realm_processor.h",
+]
+
+[[fields]]
+name = "workSpaceSize"
+type = "size_t"
+
+[[fields]]
+name = "allowTensorOpMathConversion"
+type = "bool"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::Realm::Processor"
+
+[[fields]]
+name = "origin_result_ptr"
+type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle *"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
new file mode 100644
index 0000000000..a9aa77dde9
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
@@ -0,0 +1,42 @@
+namespace = "FlexFlow"
+name = "DeviceStateInitTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/realm.h",
+  "task-spec/device_specific_per_device_op_state.dtg.h",
+  "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::DynamicNodeInvocation"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "::FlexFlow::OptimizerAttrs"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::Realm::Processor"
+
+[[fields]]
+name = "origin_result_ptr"
+type = "::FlexFlow::DeviceSpecificPerDeviceOpState *"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
new file mode 100644
index 0000000000..3a187924c8
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
@@ -0,0 +1,30 @@
+namespace = "FlexFlow"
+name = "SerializableDeviceHandleInitTaskArgs"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "realm-execution/realm.h",
+  "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
+]
+
+[[fields]]
+name = "workSpaceSize"
+type = "size_t"
+
+[[fields]]
+name = "allowTensorOpMathConversion"
+type = "bool"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::SerializableRealmProcessor"
+
+[[fields]]
+name = "origin_result_ptr"
+type = "uintptr_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
new file mode 100644
index 0000000000..b239221c16
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
@@ -0,0 +1,17 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_H
+
+#include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.h"
+
+namespace FlexFlow {
+
+SerializableDeviceHandleInitTaskArgs
+    device_handle_init_task_args_to_serializable(
+        DeviceHandleInitTaskArgs const &);
+DeviceHandleInitTaskArgs device_handle_init_task_args_from_serializable(
+    SerializableDeviceHandleInitTaskArgs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
new file mode 100644
index 0000000000..68076b7d70
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
@@ -0,0 +1,48 @@
+namespace = "FlexFlow"
+name = "SerializableDeviceStateInitTaskArgs"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/realm.h",
+  "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
+  "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
+  "task-spec/device_specific_per_device_op_state.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::SerializableDynamicNodeInvocation"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::SerializableDeviceSpecificPtr"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "::FlexFlow::OptimizerAttrs"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::SerializableRealmProcessor"
+
+[[fields]]
+name = "origin_result_ptr"
+type = "uintptr_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
new file mode 100644
index 0000000000..2467f2067c
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_H
+
+#include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.h"
+
+namespace FlexFlow {
+
+SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
+    DeviceStateInitTaskArgs const &);
+DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
+    SerializableDeviceStateInitTaskArgs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml
new file mode 100644
index 0000000000..07cf61f7e1
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml
@@ -0,0 +1,28 @@
+namespace = "FlexFlow"
+name = "SerializableDeviceSpecificPtr"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "pcg/device_id_t.dtg.h",
+  "cstdint",
+  "optional",
+]
+
+src_includes = [
+  "utils/fmt/optional.h",
+  "utils/json/optional.h",
+]
+
+[[fields]]
+name = "device_idx"
+type = "::FlexFlow::device_id_t"
+
+[[fields]]
+name = "ptr"
+type = "std::optional<uintptr_t>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
index fc5abba587..3208368d2d 100644
--- a/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
@@ -9,8 +9,7 @@ namespace FlexFlow {
 
 template <typename T>
 std::string serialize_task_args(T const &args) {
-  nlohmann::json j;
-  args.serialize(j);
+  nlohmann::json j = args;
   return j.dump();
 }
 
@@ -18,7 +17,7 @@ template <typename T>
 T deserialize_task_args(void const *args, size_t arglen) {
   nlohmann::json j = nlohmann::json::parse(
       std::string_view{reinterpret_cast<char const *>(args), arglen});
-  return T::deserialize(j);
+  return j.get<T>();
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index ea0782fd4b..6e0cef0bb2 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -16,25 +16,25 @@ std::optional<ManagedPerDeviceFFHandle *>
   return this->handle;
 }
 
-void DeviceSpecificManagedPerDeviceFFHandle::serialize(
-    nlohmann::json &j) const {
-  j = {
-      {"owner", owner},
-      {"handle",
-       transform(handle,
-                 [](ManagedPerDeviceFFHandle *ptr) {
-                   return reinterpret_cast<uintptr_t>(ptr);
-                 })},
+SerializableDeviceSpecificPtr
+    DeviceSpecificManagedPerDeviceFFHandle::serialize() const {
+  return SerializableDeviceSpecificPtr{
+      /*device_idx=*/owner,
+      /*ptr=*/
+      transform(handle,
+                [](ManagedPerDeviceFFHandle *ptr) {
+                  return reinterpret_cast<uintptr_t>(ptr);
+                }),
   };
 }
 
 DeviceSpecificManagedPerDeviceFFHandle
     DeviceSpecificManagedPerDeviceFFHandle::deserialize(
-        nlohmann::json const &j) {
+        SerializableDeviceSpecificPtr const &handle) {
   return DeviceSpecificManagedPerDeviceFFHandle{
-      /*owner=*/j.at("owner").get<device_id_t>(),
+      /*owner=*/handle.device_idx,
       /*handle=*/
-      transform(j.at("handle").get<std::optional<uintptr_t>>(),
+      transform(handle.ptr,
                 [](uintptr_t ptrval) {
                   return reinterpret_cast<ManagedPerDeviceFFHandle *>(ptrval);
                 }),
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 199f2dc090..8e6ab022aa 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -110,6 +110,7 @@ PCGInstance create_pcg_instance(
   // TODO list:
   //  * external instances
   //  * task argument serializer
+  //  * pass instances to task and convert to tensor accessor
   //  * copies
   //  * parallel operator implementation (partition, reduce, gather, etc.)
   //  * and fused parallel operators (reduce + broadcast = allreduce)
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
index cd5608ca7e..5cd53ea062 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
@@ -1,33 +1,14 @@
 #include "realm-execution/tasks/impl/device_handle_init_task.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_handle_init_return_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.h"
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include <type_traits>
 
 namespace FlexFlow {
 
-// TODO: at some point we're going to have to actually serialize these, but for
-// now just pass the pointer and assume we're running inside a single address
-// space
-struct DeviceHandleInitTaskArgs {
-  DeviceHandleInitTaskArgs() = delete;
-  DeviceHandleInitTaskArgs(
-      size_t workSpaceSize,
-      bool allowTensorOpMathConversion,
-      Realm::Processor origin_proc,
-      DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr)
-      : workSpaceSize(workSpaceSize),
-        allowTensorOpMathConversion(allowTensorOpMathConversion),
-        origin_proc(origin_proc), origin_result_ptr(origin_result_ptr) {}
-
-public:
-  size_t workSpaceSize;
-  bool allowTensorOpMathConversion;
-  Realm::Processor origin_proc;
-  DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
-};
-static_assert(std::is_trivially_copy_constructible_v<DeviceHandleInitTaskArgs>);
-
 static std::optional<ManagedPerDeviceFFHandle *>
     make_device_handle_for_processor(Realm::Processor processor,
                                      size_t workSpaceSize,
@@ -52,12 +33,10 @@ void device_handle_init_task_body(void const *args,
                                   void const *userdata,
                                   size_t userlen,
                                   Realm::Processor proc) {
-  ASSERT(arglen == sizeof(DeviceHandleInitTaskArgs));
   DeviceHandleInitTaskArgs task_args =
-      *reinterpret_cast<DeviceHandleInitTaskArgs const *>(args);
-
-  // FIXME: serialize instead of passing pointers around
-  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+      device_handle_init_task_args_from_serializable(
+          deserialize_task_args<SerializableDeviceHandleInitTaskArgs>(args,
+                                                                      arglen));
 
   RealmContext ctx{proc};
   DeviceSpecificManagedPerDeviceFFHandle managed_handle =
@@ -89,6 +68,8 @@ Realm::Event spawn_device_handle_init_task(
       result_ptr,
   };
 
+  std::string args = serialize_task_args(
+      device_handle_init_task_args_to_serializable(task_args));
   return ctx.spawn_task(target_proc,
                         task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
                         &task_args,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index 312c3f2401..99c72cf5e7 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -1,95 +1,26 @@
 #include "realm-execution/tasks/impl/device_state_init_task.h"
-#include "kernels/device_handle_t.dtg.h"
 #include "local-execution/device_state_initialization.h"
-#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
-#include "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h"
-#include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+#include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
-#include "task-spec/device_specific_per_device_op_state.dtg.h"
-#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
-#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
 #include "utils/optional.h"
-#include <cstdint>
 #include <optional>
 #include <type_traits>
 
 namespace FlexFlow {
 
-// TODO: at some point we're going to have to actually serialize these, but for
-// now just pass the pointer and assume we're running inside a single address
-// space
-struct DeviceStateInitTaskArgs {
-  DeviceStateInitTaskArgs() = delete;
-  DeviceStateInitTaskArgs(
-      DynamicNodeInvocation const &invocation,
-      ProfilingSettings const &profiling_settings,
-      DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-      FFIterationConfig const &iteration_config,
-      OptimizerAttrs const &optimizer_attrs,
-      Realm::Processor origin_proc,
-      DeviceSpecificPerDeviceOpState *origin_result_ptr)
-      : invocation(invocation), profiling_settings(profiling_settings),
-        device_handle(device_handle), iteration_config(iteration_config),
-        optimizer_attrs(optimizer_attrs), origin_proc(origin_proc),
-        origin_result_ptr(origin_result_ptr) {}
-
-  void serialize(nlohmann::json &j) const {
-    nlohmann::json j_device_handle;
-    device_handle.serialize(j_device_handle);
-    j = {
-        {"invocation", dynamic_node_invocation_to_serializable(invocation)},
-        {"profiling_settings", profiling_settings},
-        {"device_handle", j_device_handle},
-        {"iteration_config", iteration_config},
-        {"optimizer_attrs", optimizer_attrs},
-        {"origin_proc", realm_processor_to_serializable(origin_proc)},
-        {"origin_result_ptr", reinterpret_cast<uintptr_t>(origin_result_ptr)},
-    };
-  }
-
-  static DeviceStateInitTaskArgs deserialize(nlohmann::json const &j) {
-    return DeviceStateInitTaskArgs{
-        /*invocation=*/dynamic_node_invocation_from_serializable(
-            j.at("invocation").get<SerializableDynamicNodeInvocation>()),
-        /*profiling_settings=*/
-        j.at("profiling_settings").get<ProfilingSettings>(),
-        /*device_handle=*/
-        DeviceSpecificManagedPerDeviceFFHandle::deserialize(
-            j.at("device_handle")),
-        /*iteration_config=*/j.at("iteration_config").get<FFIterationConfig>(),
-        /*optimizer_attrs=*/j.at("optimizer_attrs").get<OptimizerAttrs>(),
-        /*origin_proc=*/
-        realm_processor_from_serializable(
-            j.at("origin_proc").get<SerializableRealmProcessor>()),
-        /*origin_result_ptr=*/
-        reinterpret_cast<DeviceSpecificPerDeviceOpState *>(
-            j.at("origin_result_ptr").get<uintptr_t>()),
-    };
-  }
-
-public:
-  DynamicNodeInvocation invocation;
-  ProfilingSettings profiling_settings;
-  DeviceSpecificManagedPerDeviceFFHandle device_handle;
-  FFIterationConfig iteration_config;
-  OptimizerAttrs optimizer_attrs;
-  Realm::Processor origin_proc;
-  DeviceSpecificPerDeviceOpState *origin_result_ptr;
-};
-
 void device_state_init_task_body(void const *args,
                                  size_t arglen,
                                  void const *userdata,
                                  size_t userlen,
                                  Realm::Processor proc) {
   DeviceStateInitTaskArgs task_args =
-      deserialize_task_args<DeviceStateInitTaskArgs>(args, arglen);
-
-  // FIXME: serialize instead of passing pointers around
-  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+      device_state_init_task_args_from_serializable(
+          deserialize_task_args<SerializableDeviceStateInitTaskArgs>(args,
+                                                                     arglen));
 
   RealmContext ctx{proc};
   device_handle_t device_handle =
@@ -143,7 +74,8 @@ std::optional<Realm::Event> spawn_device_state_init_task(
                         }),
                get_init_task_id_for_op_attrs);
   if (task_id.has_value()) {
-    std::string args = serialize_task_args(task_args);
+    std::string args = serialize_task_args(
+        device_state_init_task_args_to_serializable(task_args));
     return ctx.spawn_task(target_proc,
                           assert_unwrap(task_id),
                           args.data(),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc
new file mode 100644
index 0000000000..a44a5a5db1
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc
@@ -0,0 +1,28 @@
+#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.h"
+
+namespace FlexFlow {
+
+SerializableDeviceHandleInitTaskArgs
+    device_handle_init_task_args_to_serializable(
+        DeviceHandleInitTaskArgs const &args) {
+  return SerializableDeviceHandleInitTaskArgs{
+      /*workSpaceSize=*/args.workSpaceSize,
+      /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
+      /*origin_proc=*/realm_processor_to_serializable(args.origin_proc),
+      /*origin_result_ptr=*/reinterpret_cast<uintptr_t>(args.origin_result_ptr),
+  };
+}
+
+DeviceHandleInitTaskArgs device_handle_init_task_args_from_serializable(
+    SerializableDeviceHandleInitTaskArgs const &args) {
+  return DeviceHandleInitTaskArgs{
+      /*workSpaceSize=*/args.workSpaceSize,
+      /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
+      /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
+      /*origin_result_ptr=*/
+      reinterpret_cast<DeviceSpecificManagedPerDeviceFFHandle *>(
+          args.origin_result_ptr),
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
new file mode 100644
index 0000000000..528ff26867
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
@@ -0,0 +1,36 @@
+#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+
+namespace FlexFlow {
+
+SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
+    DeviceStateInitTaskArgs const &args) {
+  return SerializableDeviceStateInitTaskArgs{
+      /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/args.device_handle.serialize(),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs,
+      /*origin_proc=*/realm_processor_to_serializable(args.origin_proc),
+      /*origin_result_ptr=*/reinterpret_cast<uintptr_t>(args.origin_result_ptr),
+  };
+}
+
+DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
+    SerializableDeviceStateInitTaskArgs const &args) {
+  return DeviceStateInitTaskArgs{
+      /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/
+      DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs,
+      /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
+      /*origin_result_ptr=*/
+      reinterpret_cast<DeviceSpecificPerDeviceOpState *>(
+          args.origin_result_ptr),
+  };
+}
+
+} // namespace FlexFlow

From 5a1c83083b9074e969d6701dca0c366b30707d6d Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 15:13:38 -0800
Subject: [PATCH 060/113] Convert op task args.

---
 .../tasks/impl/op_task_args.dtg.toml          | 32 ++++++++++
 ...able_device_handle_init_task_args.dtg.toml |  1 -
 ...erializable_device_handle_init_task_args.h |  4 +-
 ...zable_device_state_init_task_args.dtg.toml |  1 -
 ...serializable_device_state_init_task_args.h |  4 +-
 .../impl/serializable_op_task_args.dtg.toml   | 42 +++++++++++++
 .../tasks/impl/serializable_op_task_args.h    | 14 +++++
 .../tasks/impl/device_handle_init_task.cc     |  4 +-
 .../src/realm-execution/tasks/impl/op_task.cc | 60 ++++++-------------
 .../tasks/impl/serializable_op_task_args.cc   | 27 +++++++++
 10 files changed, 139 insertions(+), 50 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
new file mode 100644
index 0000000000..814f9f802b
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -0,0 +1,32 @@
+namespace = "FlexFlow"
+name = "OpTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::DynamicNodeInvocation"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "std::optional<::FlexFlow::OptimizerAttrs>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
index 3a187924c8..34f52880f8 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
@@ -9,7 +9,6 @@ features = [
 ]
 
 includes = [
-  "realm-execution/realm.h",
   "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
 ]
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
index b239221c16..63d70fe10a 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H
 
 #include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
 #include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.h"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
index 68076b7d70..c99d2758c0 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
@@ -11,7 +11,6 @@ features = [
 includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
-  "realm-execution/realm.h",
   "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
   "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
   "task-spec/device_specific_per_device_op_state.dtg.h",
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
index 2467f2067c..f028820974 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H
 
 #include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
 #include "realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.h"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
new file mode 100644
index 0000000000..a0f89e3ae2
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
@@ -0,0 +1,42 @@
+namespace = "FlexFlow"
+name = "SerializableOpTaskArgs"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+]
+
+src_includes = [
+  "utils/fmt/optional.h",
+  "utils/json/optional.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::SerializableDynamicNodeInvocation"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::SerializableDeviceSpecificPtr"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "std::optional<::FlexFlow::OptimizerAttrs>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h
new file mode 100644
index 0000000000..3b2d05d0b6
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_OP_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_OP_TASK_ARGS_H
+
+#include "realm-execution/tasks/impl/op_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_op_task_args.dtg.h"
+
+namespace FlexFlow {
+
+SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &);
+OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
index 5cd53ea062..b806aa1277 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
@@ -72,8 +72,8 @@ Realm::Event spawn_device_handle_init_task(
       device_handle_init_task_args_to_serializable(task_args));
   return ctx.spawn_task(target_proc,
                         task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
-                        &task_args,
-                        sizeof(task_args),
+                        args.data(),
+                        args.size(),
                         Realm::ProfilingRequestSet{},
                         precondition);
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index e17973febb..d8b8873442 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -1,6 +1,9 @@
 #include "realm-execution/tasks/impl/op_task.h"
 #include "local-execution/task_execution.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/tasks/impl/op_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_op_task_args.h"
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/per_device_op_state.h"
 #include "utils/optional.h"
@@ -8,59 +11,31 @@
 
 namespace FlexFlow {
 
-// TODO: at some point we're going to have to actually serialize these, but for
-// now just pass the pointer and assume we're running inside a single address
-// space
-struct OpTaskArgs {
-public:
-  OpTaskArgs() = delete;
-  OpTaskArgs(DynamicNodeInvocation const *invocation,
-             ProfilingSettings const *profiling_settings,
-             DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-             FFIterationConfig const *iteration_config,
-             std::optional<OptimizerAttrs> const *optimizer_attrs,
-             Realm::Processor origin_proc)
-      : invocation(invocation), profiling_settings(profiling_settings),
-        device_handle(device_handle), iteration_config(iteration_config),
-        optimizer_attrs(optimizer_attrs) {}
-
-public:
-  DynamicNodeInvocation const *invocation;
-  ProfilingSettings const *profiling_settings;
-  DeviceSpecificManagedPerDeviceFFHandle device_handle;
-  FFIterationConfig const *iteration_config;
-  std::optional<OptimizerAttrs> const *optimizer_attrs;
-  Realm::Processor origin_proc;
-};
-
 void op_task_body(void const *args,
                   size_t arglen,
                   void const *userdata,
                   size_t userlen,
                   Realm::Processor proc) {
-  ASSERT(arglen == sizeof(OpTaskArgs));
-  OpTaskArgs task_args = *reinterpret_cast<OpTaskArgs const *>(args);
-
-  // FIXME: serialize instead of passing pointers around
-  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+  OpTaskArgs task_args = op_task_args_from_serializable(
+      deserialize_task_args<SerializableOpTaskArgs>(args, arglen));
 
   RealmContext ctx{proc};
   device_handle_t device_handle =
       device_handle_t_from_device_specific_managed_handle(
           task_args.device_handle, ctx.get_current_device_idx());
   execute_dynamic_node_invocation(
-      /*invocation=*/*task_args.invocation,
+      /*invocation=*/task_args.invocation,
       /*allocator=*/ctx.get_current_device_allocator(),
-      /*profiling_settings=*/*task_args.profiling_settings,
+      /*profiling_settings=*/task_args.profiling_settings,
       /*ff_handle=*/device_handle,
       /*per_device_op_state=*/
-      transform(task_args.invocation->node_attrs.per_device_op_state,
+      transform(task_args.invocation.node_attrs.per_device_op_state,
                 [&](DeviceSpecificPerDeviceOpState const &op_state) {
                   return get_device_state_from_device_specific(
                       op_state, ctx.get_current_device_idx());
                 }),
-      /*iteration_config=*/*task_args.iteration_config,
-      /*optimizer_attrs=*/*task_args.optimizer_attrs,
+      /*iteration_config=*/task_args.iteration_config,
+      /*optimizer_attrs=*/task_args.optimizer_attrs,
       /*device_idx=*/ctx.get_current_device_idx());
 }
 
@@ -73,17 +48,18 @@ Realm::Event
                   FFIterationConfig const &iteration_config,
                   std::optional<OptimizerAttrs> const &optimizer_attrs,
                   Realm::Event precondition) {
-  OpTaskArgs task_args{&invocation,
-                       &profiling_settings,
+  OpTaskArgs task_args{invocation,
+                       profiling_settings,
                        device_handle,
-                       &iteration_config,
-                       &optimizer_attrs,
-                       ctx.get_current_processor()};
+                       iteration_config,
+                       optimizer_attrs};
+  std::string args =
+      serialize_task_args(op_task_args_to_serializable(task_args));
   return ctx.spawn_task(
       target_proc,
       assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)),
-      &task_args,
-      sizeof(task_args),
+      args.data(),
+      args.size(),
       Realm::ProfilingRequestSet{},
       precondition);
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
new file mode 100644
index 0000000000..0513bc6df7
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -0,0 +1,27 @@
+#include "realm-execution/tasks/impl/serializable_op_task_args.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+
+namespace FlexFlow {
+
+SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
+  return SerializableOpTaskArgs{
+      /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/args.device_handle.serialize(),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs,
+  };
+}
+
+OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
+  return OpTaskArgs{
+      /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/
+      DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs,
+  };
+}
+
+} // namespace FlexFlow

From 07ed0c8c598794e2657eac4911fa2251e50ea883 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 17:37:07 -0800
Subject: [PATCH 061/113] Map the PCG for test.

---
 .../test/src/realm-execution/test_e2e.cc      | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 33ad2bbbc1..8e5edf72ad 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -1,7 +1,12 @@
 #include "internal/realm_test_utils.h"
 #include "kernels/allocation.h"
 #include "op-attrs/tensor_shape.dtg.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
+#include "pcg/device_type.dtg.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/pcg_instance/pcg_instance.h"
@@ -126,7 +131,44 @@ TEST_SUITE(FF_TEST_SUITE) {
       parallel_tensor_guid_t t_linear_2 =
           require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
 
-      MappedParallelComputationGraph mpcg{pcg, {}};
+      MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU};
+      ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
+      MappedParallelComputationGraph mpcg{
+          pcg,
+          {
+              {inputs_layer.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {linear_operator_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+              {linear_operator_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+          },
+      };
 
       // instantiate computation graph
       LossAttrs loss_attrs = LossAttrs{

From 59034b1e73c94e08fee3b6b685edc1787a0efa2a Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 17:43:29 -0800
Subject: [PATCH 062/113] Fix a bug in shard expansion.

---
 lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
index 33b7fb8591..402e0ef055 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
@@ -15,7 +15,7 @@ bool value_is_shard_expanded(DynamicValueAttrs const &n) {
 
 bool no_part_of_graph_is_shard_expanded(DynamicOpenDataflowGraph const &g) {
   auto slot_is_shard_expanded = [](DynamicTensorSlot const &) -> bool {
-    return true;
+    return false;
   };
 
   return no_part_of_dynamic_graph_satisfies(g,

From 01cee4cd4c2595e359a7b1684ea70854f604275a Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Sat, 14 Feb 2026 17:53:00 -0800
Subject: [PATCH 063/113] Finish body of instance allocation.

---
 .../src/realm-execution/instance_allocation.cc        | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index c033f0bac1..b740859e22 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -5,6 +5,7 @@
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
@@ -14,6 +15,7 @@
 #include "utils/containers/make.h"
 #include "utils/containers/map_values.h"
 #include "utils/containers/unordered_set_of.h"
+#include "utils/containers/values.h"
 #include "utils/exception.h"
 #include "utils/optional.h"
 
@@ -59,6 +61,15 @@ TensorInstanceBacking perform_instance_allocation(
     }
   };
 
+  for (DynamicNodeInvocation const &invocation : g.invocations) {
+    for (DynamicValueAttrs const &input : values(invocation.inputs)) {
+      allocate(invocation.node_attrs, input);
+    }
+    for (DynamicValueAttrs const &output : values(invocation.outputs)) {
+      allocate(invocation.node_attrs, output);
+    }
+  }
+
   return result;
 }
 

From 99f4d9718725989e353f3bbd033d76474766855e Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 09:59:14 -0800
Subject: [PATCH 064/113] Fix some bugs in loss insertion, instance allocation.

---
 .../computation_graph_instance.cc                      |  3 ++-
 .../realm-execution/pcg_instance/pcg_instance.h        |  1 +
 .../src/realm-execution/instance_allocation.cc         |  5 ++---
 .../src/realm-execution/pcg_instance/pcg_instance.cc   |  4 +++-
 .../test/src/realm-execution/test_e2e.cc               |  7 +++++++
 .../include/task-spec/dynamic_graph/loss_insertion.h   |  9 ++++++---
 .../src/task-spec/dynamic_graph/loss_insertion.cc      | 10 ++++++----
 7 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
index e251fafe5f..40d9b187c4 100644
--- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
+++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
@@ -81,7 +81,8 @@ ComputationGraphInstance create_computation_graph_instance(
     auto [loss_inserted_dg, label_v, logit_grad_v] = perform_loss_insertion(
         dg,
         assert_unwrap(loss_attrs),
-        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)});
+        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)},
+        std::nullopt);
     dg = loss_inserted_dg;
     logit_grad_value = logit_grad_v;
     inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index b0037f51b2..fa163d1419 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -51,6 +51,7 @@ PCGInstance create_pcg_instance(
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorR> label_tensor,
     std::optional<parallel_tensor_guid_t> logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index b740859e22..797455573c 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -52,12 +52,11 @@ TensorInstanceBacking perform_instance_allocation(
       // FIXME: Attach external instance to existing allocation and use that
       NOT_IMPLEMENTED();
     } else {
-      if (contains_key(result.backing, v)) {
-        return result.backing.at(v);
-      } else {
+      if (!contains_key(result.backing, v)) {
         result.backing.insert(
             std::pair{v, perform_instance_allocation_for_value(n, v, ctx)});
       }
+      return result.backing.at(v);
     }
   };
 
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 8e6ab022aa..7b047bcb72 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -54,6 +54,7 @@ PCGInstance create_pcg_instance(
     std::optional<LossAttrs> const &loss_attrs,
     std::optional<GenericTensorAccessorR> label_tensor,
     std::optional<parallel_tensor_guid_t> logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
@@ -71,7 +72,8 @@ PCGInstance create_pcg_instance(
     auto [dg2, label_v, logit_grad_v] = perform_loss_insertion(
         dg,
         assert_unwrap(loss_attrs),
-        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)});
+        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)},
+        loss_mapping);
     dg = dg2;
     logit_grad_value = logit_grad_v;
     inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 8e5edf72ad..4dbfe09045 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -169,6 +169,12 @@ TEST_SUITE(FF_TEST_SUITE) {
                      }}}}}},
           },
       };
+      MappedOperatorTaskGroup loss_mapping{
+          {{cpu0,
+            OperatorAtomicTaskShardBinding{{
+                {TensorSlotName::INPUT, tensor_coord0},
+                {TensorSlotName::LOGIT, tensor_coord0},
+            }}}}};
 
       // instantiate computation graph
       LossAttrs loss_attrs = LossAttrs{
@@ -194,6 +200,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*loss=*/loss_attrs,
           /*label_tensor=*/label_tensor,
           /*logit_tensor=*/t_linear_2,
+          /*loss_mapping=*/loss_mapping,
           /*input_tensors=*/input_tensors,
           /*profiling_settings=*/ProfilingSettings{0, 0},
           /*device_handle=*/device_handle,
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h
index c7cef3f06f..b3b2a465f8 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h
+++ b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h
@@ -6,12 +6,15 @@
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion_result.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
-LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
-                                           LossAttrs const &loss_attrs,
-                                           dynamic_tensor_guid_t logit_tensor);
+LossInsertionResult perform_loss_insertion(
+    DynamicOpenDataflowGraph const &dg,
+    LossAttrs const &loss_attrs,
+    dynamic_tensor_guid_t logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping);
 
 } // namespace FlexFlow
 
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
index 4270119612..857fed1a84 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
@@ -12,9 +12,11 @@
 
 namespace FlexFlow {
 
-LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
-                                           LossAttrs const &loss_attrs,
-                                           dynamic_tensor_guid_t logit_tensor) {
+LossInsertionResult perform_loss_insertion(
+    DynamicOpenDataflowGraph const &dg,
+    LossAttrs const &loss_attrs,
+    dynamic_tensor_guid_t logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping) {
   DynamicValueAttrs logit_value = assert_unwrap(
       find_output_value_attrs(dg, logit_tensor, mk_dynamic_tensor_role_fwd()));
 
@@ -45,7 +47,7 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
       DynamicNodeAttrs{
           /*task_type=*/DynamicTaskType::LOSS,
           /*device_coord=*/std::nullopt,
-          /*mapping=*/std::nullopt,
+          /*mapping=*/loss_mapping,
           /*op_attrs=*/TrainingOperationAttrs{loss_attrs},
           /*layer_guid=*/mk_dynamic_layer_guid_for_loss(),
           /*per_device_op_state=*/std::nullopt,

From e9f327abeb6ebf94acdb87920a7bf669b6e13aa7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 10:53:26 -0800
Subject: [PATCH 065/113] Fixes for PCG initialization.

---
 .../include/realm-execution/fmt/realm_event.h | 35 +++++++++++++++++++
 .../fmt/{instance.h => realm_instance.h}      |  4 +--
 .../tensor_instance_backing.dtg.toml          |  5 +--
 .../src/realm-execution/fmt/realm_event.cc    | 10 ++++++
 .../fmt/{instance.cc => realm_instance.cc}    |  2 +-
 .../pcg_instance/pcg_instance.cc              | 17 +++++++++
 .../tasks/realm_task_registry.cc              |  2 ++
 7 files changed, 70 insertions(+), 5 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/fmt/realm_event.h
 rename lib/realm-execution/include/realm-execution/fmt/{instance.h => realm_instance.h} (96%)
 create mode 100644 lib/realm-execution/src/realm-execution/fmt/realm_event.cc
 rename lib/realm-execution/src/realm-execution/fmt/{instance.cc => realm_instance.cc} (80%)

diff --git a/lib/realm-execution/include/realm-execution/fmt/realm_event.h b/lib/realm-execution/include/realm-execution/fmt/realm_event.h
new file mode 100644
index 0000000000..a7df28ced6
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/fmt/realm_event.h
@@ -0,0 +1,35 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_EVENT_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_EVENT_H
+
+#include "realm-execution/realm.h"
+#include "utils/check_fmtable.h"
+#include <fmt/format.h>
+#include <utility>
+
+namespace fmt {
+
+template <typename Char>
+struct formatter<::FlexFlow::Realm::Event,
+                 Char,
+                 std::enable_if_t<!detail::has_format_as<
+                     ::FlexFlow::Realm::Event>::value>>
+    : formatter<::std::string> {
+  template <typename FormatContext>
+  auto format(::FlexFlow::Realm::Event const &m, FormatContext &ctx)
+      -> decltype(ctx.out()) {
+    std::string result = fmt::format("<Event {}>", m.id);
+
+    return formatter<std::string>::format(result, ctx);
+  }
+};
+
+} // namespace fmt
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s,
+                         ::FlexFlow::Realm::Event const &m);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/fmt/instance.h b/lib/realm-execution/include/realm-execution/fmt/realm_instance.h
similarity index 96%
rename from lib/realm-execution/include/realm-execution/fmt/instance.h
rename to lib/realm-execution/include/realm-execution/fmt/realm_instance.h
index c7c2df6735..e6d2846c1f 100644
--- a/lib/realm-execution/include/realm-execution/fmt/instance.h
+++ b/lib/realm-execution/include/realm-execution/fmt/realm_instance.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_INSTANCE_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_INSTANCE_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_INSTANCE_H
 
 #include "realm-execution/realm.h"
 #include "utils/check_fmtable.h"
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index e6a8bd58d9..6c43990282 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -3,7 +3,7 @@ name = "TensorInstanceBacking"
 type = "struct"
 features = [
   "eq",
-  #"fmt",
+  "fmt",
   #"hash",
 ]
 
@@ -14,7 +14,8 @@ includes = [
 ]
 
 src_includes = [
-  "realm-execution/fmt/instance.h",
+  "realm-execution/fmt/realm_event.h",
+  "realm-execution/fmt/realm_instance.h",
   "utils/hash/unordered_map.h",
   "utils/fmt/unordered_map.h",
 ]
diff --git a/lib/realm-execution/src/realm-execution/fmt/realm_event.cc b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
new file mode 100644
index 0000000000..7c5ad7d848
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
@@ -0,0 +1,10 @@
+#include "realm-execution/fmt/realm_event.h"
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s,
+                         ::FlexFlow::Realm::Event const &m) {
+  return s << fmt::to_string(m);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/fmt/instance.cc b/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc
similarity index 80%
rename from lib/realm-execution/src/realm-execution/fmt/instance.cc
rename to lib/realm-execution/src/realm-execution/fmt/realm_instance.cc
index f8eabe9bb0..301954f824 100644
--- a/lib/realm-execution/src/realm-execution/fmt/instance.cc
+++ b/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/fmt/instance.h"
+#include "realm-execution/fmt/realm_instance.h"
 
 namespace FlexFlow {
 
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 7b047bcb72..c21737300c 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -1,11 +1,14 @@
 #include "realm-execution/pcg_instance/pcg_instance.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
 #include "pcg/optimizer_attrs.h"
 #include "realm-execution/dependency_set.h"
 #include "realm-execution/distributed_device_state_initialization.h"
 #include "realm-execution/instance_allocation.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tasks/impl/op_task.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_task_type.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
@@ -83,6 +86,20 @@ PCGInstance create_pcg_instance(
   dg = perform_shard_expansion(dg);
   TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
 
+  logit_grad_value = transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
+    for (DynamicNodeInvocation const &invocation : dg.invocations) {
+      if (invocation.node_attrs.task_type != DynamicTaskType::LOSS) {
+        continue;
+      }
+      for (auto const &[slot, value] : invocation.outputs) {
+        if (slot.slot_name == TensorSlotName::LOGIT && value.tensor_guid == lgv.tensor_guid && value.role == lgv.role) {
+          return value;
+        }
+      }
+    }
+    PANIC("couldn't find updated logit grad in the shard-expanded dynamic graph");
+  });
+
   std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
         return backing.backing.at(lgv).first;
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index cff12c2391..914e8d1e29 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -49,6 +49,8 @@ Realm::Event register_all_tasks() {
   };
 
   for (task_id_t task_id : init_task_ids) {
+    pending_registrations.push_back(register_task(
+        Realm::Processor::LOC_PROC, task_id, device_state_init_task_body));
     pending_registrations.push_back(register_task(
         Realm::Processor::TOC_PROC, task_id, device_state_init_task_body));
   }

From 4f7add76dab3751f895a1ae33ba778910a3b1ba0 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 11:12:26 -0800
Subject: [PATCH 066/113] Fix a bug in device state handling.

---
 .../device_specific_managed_per_device_ff_handle.cc             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index 6e0cef0bb2..bcc0a22ccf 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -51,7 +51,7 @@ device_handle_t device_handle_t_from_device_specific_managed_handle(
     DeviceSpecificManagedPerDeviceFFHandle const &device_specific,
     device_id_t device_idx) {
   return device_handle_t_from_managed_handle_ptr(
-      *device_specific.get(device_idx));
+      device_specific.get(device_idx));
 }
 
 } // namespace FlexFlow

From a316360f7a282f219b54ed81a4b7a8f309cb14a7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 12:27:19 -0800
Subject: [PATCH 067/113] Implement most of tensor backing in task.

---
 .../distributed_device_state_initialization.h |  4 +-
 .../dynamic_tensor_accessor_from_instance.h   | 14 +++
 .../include/realm-execution/fmt/realm_event.h | 11 +--
 .../include/realm-execution/hash/processor.h  |  4 +
 .../pcg_instance/pcg_instance.h               | 12 ++-
 .../include/realm-execution/realm.h           |  2 +-
 .../tasks/impl/device_state_init_task.h       |  4 +
 .../impl/device_state_init_task_args.dtg.toml |  5 +
 .../realm-execution/tasks/impl/op_task.h      | 20 ++--
 .../tasks/impl/op_task_args.dtg.toml          |  6 ++
 ...zable_device_state_init_task_args.dtg.toml |  5 +
 .../impl/serializable_op_task_args.dtg.toml   |  5 +
 .../serializable_realm_instance.dtg.toml      | 17 ++++
 .../serializer/serializable_realm_instance.h  | 16 +++
 .../realm-execution/tensor_instance_backing.h |  4 +
 ...distributed_device_state_initialization.cc | 16 ++-
 .../dynamic_tensor_accessor_from_instance.cc  | 11 +++
 .../src/realm-execution/fmt/realm_event.cc    |  3 +-
 .../src/realm-execution/hash/processor.cc     |  4 +
 .../pcg_instance/pcg_instance.cc              | 99 ++++++++++++-------
 .../tasks/impl/device_state_init_task.cc      | 21 +++-
 .../src/realm-execution/tasks/impl/op_task.cc | 37 +++++--
 ...erializable_device_state_init_task_args.cc | 11 +++
 .../tasks/impl/serializable_op_task_args.cc   | 11 +++
 .../serializer/serializable_realm_instance.cc | 15 +++
 .../tensor_instance_backing.cc                | 14 +++
 26 files changed, 303 insertions(+), 68 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h
 create mode 100644 lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
index ca24ecdd4c..e257834e65 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -5,14 +5,16 @@
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
 
 namespace FlexFlow {
 
 DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
-    DynamicOpenDataflowGraph const &dg,
     RealmContext &ctx,
+    DynamicOpenDataflowGraph const &dg,
+    TensorInstanceBacking const &tensor_instance_backing,
     ProfilingSettings const &profiling_settings,
     DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config,
diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
new file mode 100644
index 0000000000..48cfbde924
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H
+
+#include "realm-execution/realm.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+
+namespace FlexFlow {
+
+DynamicTensorAccessor
+    dynamic_tensor_accessor_from_instance(Realm::RegionInstance const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/fmt/realm_event.h b/lib/realm-execution/include/realm-execution/fmt/realm_event.h
index a7df28ced6..a245968f39 100644
--- a/lib/realm-execution/include/realm-execution/fmt/realm_event.h
+++ b/lib/realm-execution/include/realm-execution/fmt/realm_event.h
@@ -9,10 +9,10 @@
 namespace fmt {
 
 template <typename Char>
-struct formatter<::FlexFlow::Realm::Event,
-                 Char,
-                 std::enable_if_t<!detail::has_format_as<
-                     ::FlexFlow::Realm::Event>::value>>
+struct formatter<
+    ::FlexFlow::Realm::Event,
+    Char,
+    std::enable_if_t<!detail::has_format_as<::FlexFlow::Realm::Event>::value>>
     : formatter<::std::string> {
   template <typename FormatContext>
   auto format(::FlexFlow::Realm::Event const &m, FormatContext &ctx)
@@ -27,8 +27,7 @@ struct formatter<::FlexFlow::Realm::Event,
 
 namespace FlexFlow {
 
-std::ostream &operator<<(std::ostream &s,
-                         ::FlexFlow::Realm::Event const &m);
+std::ostream &operator<<(std::ostream &s, ::FlexFlow::Realm::Event const &m);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/hash/processor.h b/lib/realm-execution/include/realm-execution/hash/processor.h
index e5eb8eb503..efe6e6186b 100644
--- a/lib/realm-execution/include/realm-execution/hash/processor.h
+++ b/lib/realm-execution/include/realm-execution/hash/processor.h
@@ -4,6 +4,8 @@
 #include "realm-execution/realm.h"
 #include <utility>
 
+#ifdef FLEXFLOW_USE_PREALM
+
 namespace std {
 
 template <>
@@ -14,3 +16,5 @@ struct hash<::FlexFlow::Realm::Processor> {
 } // namespace std
 
 #endif
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index fa163d1419..1238097b2a 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -12,6 +12,7 @@
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
@@ -29,10 +30,12 @@ struct PCGInstance {
   explicit PCGInstance(
       RealmContext &ctx,
       std::vector<DynamicNodeInvocation> const &execution_order,
+      TensorInstanceBacking const &tensor_instance_backing,
       OptimizerAttrs const &optimizer_attrs,
       std::optional<Realm::RegionInstance> logit_grad_tensor);
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
+  TensorInstanceBacking const &get_tensor_instance_backing() const;
   OptimizerAttrs const &get_optimizer_attrs() const;
   void update_optimizer_attrs_for_next_iter();
   std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
@@ -40,6 +43,7 @@ struct PCGInstance {
 private:
   RealmContext &ctx;
   std::vector<DynamicNodeInvocation> execution_order;
+  TensorInstanceBacking tensor_instance_backing;
   OptimizerAttrs optimizer_attrs;
   std::optional<Realm::RegionInstance> logit_grad_tensor;
 };
@@ -60,28 +64,28 @@ PCGInstance create_pcg_instance(
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config);
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_forward_pass_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config);
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_backward_pass_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config);
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_update_pass_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config);
diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h
index b6913e66f5..fe83e69583 100644
--- a/lib/realm-execution/include/realm-execution/realm.h
+++ b/lib/realm-execution/include/realm-execution/realm.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 
-#define FLEXFLOW_USE_PREALM
+// #define FLEXFLOW_USE_PREALM
 
 #ifdef FLEXFLOW_USE_PREALM
 #include <realm/prealm/prealm.h>
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
index 4ed8c1726d..9c53748916 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -8,7 +8,9 @@
 #include "realm-execution/realm_context.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
+#include <unordered_map>
 
 namespace FlexFlow {
 
@@ -19,6 +21,8 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     DynamicNodeInvocation const &invocation,
+    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
+        &tensor_backing,
     ProfilingSettings const &profiling_settings,
     DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
     FFIterationConfig const &iteration_config,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
index a9aa77dde9..888c62af54 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
@@ -10,6 +10,7 @@ includes = [
   "realm-execution/realm.h",
   "task-spec/device_specific_per_device_op_state.dtg.h",
   "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
 ]
 
@@ -17,6 +18,10 @@ includes = [
 name = "invocation"
 type = "::FlexFlow::DynamicNodeInvocation"
 
+[[fields]]
+name = "tensor_backing"
+type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, ::FlexFlow::Realm::RegionInstance>"
+
 [[fields]]
 name = "profiling_settings"
 type = "::FlexFlow::ProfilingSettings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 9d4c2fd451..37a801a508 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -14,15 +14,17 @@ namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event
-    spawn_op_task(RealmContext &ctx,
-                  Realm::Processor target_proc,
-                  DynamicNodeInvocation const &invocation,
-                  ProfilingSettings const &profiling_settings,
-                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-                  FFIterationConfig const &iteration_config,
-                  std::optional<OptimizerAttrs> const &optimizer_attrs,
-                  Realm::Event precondition);
+Realm::Event spawn_op_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
+        &tensor_backing,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    std::optional<OptimizerAttrs> const &optimizer_attrs,
+    Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
index 814f9f802b..84fa384d25 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -7,7 +7,9 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/realm.h",
   "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
 ]
 
@@ -15,6 +17,10 @@ includes = [
 name = "invocation"
 type = "::FlexFlow::DynamicNodeInvocation"
 
+[[fields]]
+name = "tensor_backing"
+type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, ::FlexFlow::Realm::RegionInstance>"
+
 [[fields]]
 name = "profiling_settings"
 type = "::FlexFlow::ProfilingSettings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
index c99d2758c0..f3847c9137 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
@@ -12,6 +12,7 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
+  "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h",
   "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
   "task-spec/device_specific_per_device_op_state.dtg.h",
   "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
@@ -22,6 +23,10 @@ includes = [
 name = "invocation"
 type = "::FlexFlow::SerializableDynamicNodeInvocation"
 
+[[fields]]
+name = "tensor_backing"
+type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, ::FlexFlow::SerializableRealmInstance>"
+
 [[fields]]
 name = "profiling_settings"
 type = "::FlexFlow::ProfilingSettings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
index a0f89e3ae2..3ca338689a 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
@@ -12,6 +12,7 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
+  "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h",
   "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
 ]
@@ -25,6 +26,10 @@ src_includes = [
 name = "invocation"
 type = "::FlexFlow::SerializableDynamicNodeInvocation"
 
+[[fields]]
+name = "tensor_backing"
+type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, ::FlexFlow::SerializableRealmInstance>"
+
 [[fields]]
 name = "profiling_settings"
 type = "::FlexFlow::ProfilingSettings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
new file mode 100644
index 0000000000..150801367d
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SerializableRealmInstance"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "realm-execution/realm.h",
+]
+
+[[fields]]
+name = "id"
+type = "::FlexFlow::Realm::RegionInstance::id_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h
new file mode 100644
index 0000000000..7262ec4f09
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_INSTANCE_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h"
+
+namespace FlexFlow {
+
+SerializableRealmInstance
+    realm_instance_to_serializable(Realm::RegionInstance const &);
+Realm::RegionInstance
+    realm_instance_from_serializable(SerializableRealmInstance const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
index 1d143b7409..72a8bf439a 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
@@ -2,11 +2,15 @@
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H
 
 #include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 
 namespace FlexFlow {
 
 TensorInstanceBacking make_empty_tensor_instance_backing();
 
+TensorInstanceBacking subset_tensor_instance_backing_for_invocation(
+    TensorInstanceBacking const &, DynamicNodeInvocation const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index cab2b49e15..de8060aa12 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -1,8 +1,11 @@
 #include "realm-execution/distributed_device_state_initialization.h"
 #include "local-execution/device_state_initialization.h"
 #include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "utils/containers/map_values.h"
 #include "utils/optional.h"
 #include <optional>
 #include <unordered_map>
@@ -10,8 +13,9 @@
 namespace FlexFlow {
 
 DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
-    DynamicOpenDataflowGraph const &dg,
     RealmContext &ctx,
+    DynamicOpenDataflowGraph const &dg,
+    TensorInstanceBacking const &tensor_instance_backing,
     ProfilingSettings const &profiling_settings,
     DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config,
@@ -27,6 +31,15 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     Realm::Processor target_proc = ctx.map_device_coord_to_processor(
         assert_unwrap(invocation.node_attrs.device_coord));
 
+    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance>
+        tensor_backing = map_values(
+            subset_tensor_instance_backing_for_invocation(
+                tensor_instance_backing, invocation)
+                .backing,
+            [](std::pair<Realm::RegionInstance, Realm::Event> const &v) {
+              return v.first;
+            });
+
     // FIXME: in the absense of a real serializer we're just tossing around raw
     // bytes, which means we need to bypass the constructor for this type (yes,
     // ugh)
@@ -37,6 +50,7 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
         spawn_device_state_init_task(ctx,
                                      target_proc,
                                      invocation,
+                                     tensor_backing,
                                      profiling_settings,
                                      device_handle.at(target_proc),
                                      iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
new file mode 100644
index 0000000000..cb9382cfe0
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
@@ -0,0 +1,11 @@
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+DynamicTensorAccessor
+    dynamic_tensor_accessor_from_instance(Realm::RegionInstance const &) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/fmt/realm_event.cc b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
index 7c5ad7d848..a5aed9481d 100644
--- a/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
+++ b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
@@ -2,8 +2,7 @@
 
 namespace FlexFlow {
 
-std::ostream &operator<<(std::ostream &s,
-                         ::FlexFlow::Realm::Event const &m) {
+std::ostream &operator<<(std::ostream &s, ::FlexFlow::Realm::Event const &m) {
   return s << fmt::to_string(m);
 }
 
diff --git a/lib/realm-execution/src/realm-execution/hash/processor.cc b/lib/realm-execution/src/realm-execution/hash/processor.cc
index dcc1bc5d06..5a8624f676 100644
--- a/lib/realm-execution/src/realm-execution/hash/processor.cc
+++ b/lib/realm-execution/src/realm-execution/hash/processor.cc
@@ -1,6 +1,8 @@
 #include "realm-execution/hash/processor.h"
 #include <utility>
 
+#ifdef FLEXFLOW_USE_PREALM
+
 namespace std {
 
 size_t hash<::FlexFlow::Realm::Processor>::operator()(
@@ -9,3 +11,5 @@ size_t hash<::FlexFlow::Realm::Processor>::operator()(
 }
 
 } // namespace std
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index c21737300c..496c3210c0 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -6,6 +6,7 @@
 #include "realm-execution/instance_allocation.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_task_type.dtg.h"
@@ -16,6 +17,7 @@
 #include "task-spec/dynamic_graph/pass_expansion.h"
 #include "task-spec/dynamic_graph/shard_expansion.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
+#include "utils/containers/map_values.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/values.h"
 #include "utils/graph/digraph/algorithms/get_topological_ordering.h"
@@ -26,9 +28,11 @@ namespace FlexFlow {
 PCGInstance::PCGInstance(
     RealmContext &ctx,
     std::vector<DynamicNodeInvocation> const &execution_order,
+    TensorInstanceBacking const &tensor_instance_backing,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<Realm::RegionInstance> logit_grad_tensor)
     : ctx(ctx), execution_order(execution_order),
+      tensor_instance_backing(tensor_instance_backing),
       optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {}
 
 RealmContext &PCGInstance::get_realm_context() {
@@ -38,6 +42,9 @@ std::vector<DynamicNodeInvocation> const &
     PCGInstance::get_execution_order() const {
   return this->execution_order;
 }
+TensorInstanceBacking const &PCGInstance::get_tensor_instance_backing() const {
+  return this->tensor_instance_backing;
+}
 OptimizerAttrs const &PCGInstance::get_optimizer_attrs() const {
   return this->optimizer_attrs;
 }
@@ -86,19 +93,23 @@ PCGInstance create_pcg_instance(
   dg = perform_shard_expansion(dg);
   TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
 
-  logit_grad_value = transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
-    for (DynamicNodeInvocation const &invocation : dg.invocations) {
-      if (invocation.node_attrs.task_type != DynamicTaskType::LOSS) {
-        continue;
-      }
-      for (auto const &[slot, value] : invocation.outputs) {
-        if (slot.slot_name == TensorSlotName::LOGIT && value.tensor_guid == lgv.tensor_guid && value.role == lgv.role) {
-          return value;
+  logit_grad_value =
+      transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
+        for (DynamicNodeInvocation const &invocation : dg.invocations) {
+          if (invocation.node_attrs.task_type != DynamicTaskType::LOSS) {
+            continue;
+          }
+          for (auto const &[slot, value] : invocation.outputs) {
+            if (slot.slot_name == TensorSlotName::LOGIT &&
+                value.tensor_guid == lgv.tensor_guid &&
+                value.role == lgv.role) {
+              return value;
+            }
+          }
         }
-      }
-    }
-    PANIC("couldn't find updated logit grad in the shard-expanded dynamic graph");
-  });
+        PANIC("couldn't find updated logit grad in the shard-expanded dynamic "
+              "graph");
+      });
 
   std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
@@ -108,8 +119,9 @@ PCGInstance create_pcg_instance(
   // FIXME: for now we're going to be lazy and block on everything rather than
   // do fine-grained dependencies on instances
   dg = perform_distributed_device_state_initialization(
-      dg,
       ctx,
+      dg,
+      backing,
       profiling_settings,
       device_handle,
       iteration_config,
@@ -123,8 +135,11 @@ PCGInstance create_pcg_instance(
   std::vector<DynamicNodeInvocation> invocation_topo_order = transform(
       node_topo_order, [&](Node node) { return node_map.at_l(node); });
 
-  return PCGInstance{
-      ctx, invocation_topo_order, optimizer_attrs, logit_grad_tensor};
+  return PCGInstance{/*ctx=*/ctx,
+                     /*execution_order=*/invocation_topo_order,
+                     /*tensor_instance_backing=*/backing,
+                     /*optimizer_attrs=*/optimizer_attrs,
+                     /*logit_grad_tensor=*/logit_grad_tensor};
 
   // TODO list:
   //  * external instances
@@ -141,6 +156,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     execute_distributed_dynamic_node_invocation_set(
         RealmContext &ctx,
         std::vector<DynamicNodeInvocation> const &invocations,
+        TensorInstanceBacking const &tensor_instance_backing,
         OptimizerAttrs const &optimizer_attrs,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
@@ -165,9 +181,20 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
             Realm::Event::merge_events(output_dependencies));
         Realm::Processor target_proc = ctx.map_device_coord_to_processor(
             assert_unwrap(invocation.node_attrs.device_coord));
+
+        std::unordered_map<DynamicValueAttrs, Realm::RegionInstance>
+            tensor_backing = map_values(
+                subset_tensor_instance_backing_for_invocation(
+                    tensor_instance_backing, invocation)
+                    .backing,
+                [](std::pair<Realm::RegionInstance, Realm::Event> const &v) {
+                  return v.first;
+                });
+
         Realm::Event result = spawn_op_task(ctx,
                                             target_proc,
                                             invocation,
+                                            tensor_backing,
                                             profiling_settings,
                                             device_handle.at(target_proc),
                                             iteration_config,
@@ -185,32 +212,34 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
-      instance.get_execution_order();
+      pcg_instance.get_execution_order();
   std::unordered_map<dynamic_layer_guid_t, Realm::Event> result =
       execute_distributed_dynamic_node_invocation_set(
-          /*ctx=*/instance.get_realm_context(),
+          /*ctx=*/pcg_instance.get_realm_context(),
           /*invocations=*/execution_order,
-          /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+          /*tensor_instance_backing=*/
+          pcg_instance.get_tensor_instance_backing(),
+          /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
           /*device_handle=*/device_handle,
           /*iteration_config=*/iteration_config);
-  instance.update_optimizer_attrs_for_next_iter();
+  pcg_instance.update_optimizer_attrs_for_next_iter();
   return result;
 }
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_forward_pass_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
-      filter(instance.get_execution_order(),
+      filter(pcg_instance.get_execution_order(),
              [](DynamicNodeInvocation const &invocation) {
                DynamicTaskType task_type =
                    assert_unwrap(invocation.node_attrs.task_type);
@@ -218,9 +247,10 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
              });
 
   return execute_distributed_dynamic_node_invocation_set(
-      /*ctx=*/instance.get_realm_context(),
+      /*ctx=*/pcg_instance.get_realm_context(),
       /*invocations=*/execution_order,
-      /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+      /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
       /*device_handle=*/device_handle,
       /*iteration_config=*/iteration_config);
@@ -228,12 +258,12 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_backward_pass_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
-      filter(instance.get_execution_order(),
+      filter(pcg_instance.get_execution_order(),
              [](DynamicNodeInvocation const &invocation) {
                DynamicTaskType task_type =
                    assert_unwrap(invocation.node_attrs.task_type);
@@ -241,9 +271,10 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
              });
 
   return execute_distributed_dynamic_node_invocation_set(
-      /*ctx=*/instance.get_realm_context(),
+      /*ctx=*/pcg_instance.get_realm_context(),
       /*invocations=*/execution_order,
-      /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+      /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
       /*device_handle=*/device_handle,
       /*iteration_config=*/iteration_config);
@@ -251,12 +282,12 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_update_pass_for_pcg_instance(
-        PCGInstance &instance,
+        PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
-      filter(instance.get_execution_order(),
+      filter(pcg_instance.get_execution_order(),
              [](DynamicNodeInvocation const &invocation) {
                DynamicTaskType task_type =
                    assert_unwrap(invocation.node_attrs.task_type);
@@ -265,13 +296,15 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
 
   std::unordered_map<dynamic_layer_guid_t, Realm::Event> result =
       execute_distributed_dynamic_node_invocation_set(
-          /*ctx=*/instance.get_realm_context(),
+          /*ctx=*/pcg_instance.get_realm_context(),
           /*invocations=*/execution_order,
-          /*optimizer_attrs=*/instance.get_optimizer_attrs(),
+          /*tensor_instance_backing=*/
+          pcg_instance.get_tensor_instance_backing(),
+          /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
           /*device_handle=*/device_handle,
           /*iteration_config=*/iteration_config);
-  instance.update_optimizer_attrs_for_next_iter();
+  pcg_instance.update_optimizer_attrs_for_next_iter();
   return result;
 }
 
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index 99c72cf5e7..d455b493da 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -1,11 +1,15 @@
 #include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "local-execution/device_state_initialization.h"
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
 #include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
 #include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "utils/containers/map_values.h"
 #include "utils/optional.h"
 #include <optional>
 #include <type_traits>
@@ -26,8 +30,20 @@ void device_state_init_task_body(void const *args,
   device_handle_t device_handle =
       device_handle_t_from_device_specific_managed_handle(
           task_args.device_handle, ctx.get_current_device_idx());
+
+  // Patch the invocation to include the provided instances
+  auto map_instance_to_accessor = [&](DynamicValueAttrs const &value) {
+    DynamicValueAttrs result = value;
+    result.accessor = dynamic_tensor_accessor_from_instance(
+        task_args.tensor_backing.at(value));
+    return result;
+  };
+  DynamicNodeInvocation invocation = task_args.invocation;
+  invocation.inputs = map_values(invocation.inputs, map_instance_to_accessor);
+  invocation.outputs = map_values(invocation.outputs, map_instance_to_accessor);
+
   DynamicNodeInvocation result_invocation =
-      initialize_node(task_args.invocation,
+      initialize_node(invocation,
                       ctx.get_current_device_allocator(),
                       task_args.profiling_settings,
                       device_handle,
@@ -51,6 +67,8 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     DynamicNodeInvocation const &invocation,
+    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
+        &tensor_backing,
     ProfilingSettings const &profiling_settings,
     DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
     FFIterationConfig const &iteration_config,
@@ -59,6 +77,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     Realm::Event precondition) {
   DeviceStateInitTaskArgs task_args{
       invocation,
+      tensor_backing,
       profiling_settings,
       device_handle,
       iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index d8b8873442..0f65b808aa 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -1,11 +1,13 @@
 #include "realm-execution/tasks/impl/op_task.h"
 #include "local-execution/task_execution.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
 #include "realm-execution/tasks/impl/op_task_args.dtg.h"
 #include "realm-execution/tasks/impl/serializable_op_task_args.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/per_device_op_state.h"
+#include "utils/containers/map_values.h"
 #include "utils/optional.h"
 #include <type_traits>
 
@@ -23,8 +25,20 @@ void op_task_body(void const *args,
   device_handle_t device_handle =
       device_handle_t_from_device_specific_managed_handle(
           task_args.device_handle, ctx.get_current_device_idx());
+
+  // Patch the invocation to include the provided instances
+  auto map_instance_to_accessor = [&](DynamicValueAttrs const &value) {
+    DynamicValueAttrs result = value;
+    result.accessor = dynamic_tensor_accessor_from_instance(
+        task_args.tensor_backing.at(value));
+    return result;
+  };
+  DynamicNodeInvocation invocation = task_args.invocation;
+  invocation.inputs = map_values(invocation.inputs, map_instance_to_accessor);
+  invocation.outputs = map_values(invocation.outputs, map_instance_to_accessor);
+
   execute_dynamic_node_invocation(
-      /*invocation=*/task_args.invocation,
+      /*invocation=*/invocation,
       /*allocator=*/ctx.get_current_device_allocator(),
       /*profiling_settings=*/task_args.profiling_settings,
       /*ff_handle=*/device_handle,
@@ -39,16 +53,19 @@ void op_task_body(void const *args,
       /*device_idx=*/ctx.get_current_device_idx());
 }
 
-Realm::Event
-    spawn_op_task(RealmContext &ctx,
-                  Realm::Processor target_proc,
-                  DynamicNodeInvocation const &invocation,
-                  ProfilingSettings const &profiling_settings,
-                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-                  FFIterationConfig const &iteration_config,
-                  std::optional<OptimizerAttrs> const &optimizer_attrs,
-                  Realm::Event precondition) {
+Realm::Event spawn_op_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
+        &tensor_backing,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    std::optional<OptimizerAttrs> const &optimizer_attrs,
+    Realm::Event precondition) {
   OpTaskArgs task_args{invocation,
+                       tensor_backing,
                        profiling_settings,
                        device_handle,
                        iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
index 528ff26867..59a1dd71a6 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
@@ -1,6 +1,9 @@
 #include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
 #include "realm-execution/tasks/serializer/serializable_realm_processor.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include "utils/containers/map_keys_and_values.h"
 
 namespace FlexFlow {
 
@@ -8,6 +11,10 @@ SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
     DeviceStateInitTaskArgs const &args) {
   return SerializableDeviceStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
+      /*tensor_backing*/
+      map_keys_and_values(args.tensor_backing,
+                          dynamic_value_attrs_to_serializable,
+                          realm_instance_to_serializable),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/args.device_handle.serialize(),
       /*iteration_config=*/args.iteration_config,
@@ -21,6 +28,10 @@ DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
     SerializableDeviceStateInitTaskArgs const &args) {
   return DeviceStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
+      /*tensor_backing*/
+      map_keys_and_values(args.tensor_backing,
+                          dynamic_value_attrs_from_serializable,
+                          realm_instance_from_serializable),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
       DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
index 0513bc6df7..04a213e906 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -1,11 +1,18 @@
 #include "realm-execution/tasks/impl/serializable_op_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include "utils/containers/map_keys_and_values.h"
 
 namespace FlexFlow {
 
 SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
   return SerializableOpTaskArgs{
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
+      /*tensor_backing*/
+      map_keys_and_values(args.tensor_backing,
+                          dynamic_value_attrs_to_serializable,
+                          realm_instance_to_serializable),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/args.device_handle.serialize(),
       /*iteration_config=*/args.iteration_config,
@@ -16,6 +23,10 @@ SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
 OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
   return OpTaskArgs{
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
+      /*tensor_backing*/
+      map_keys_and_values(args.tensor_backing,
+                          dynamic_value_attrs_from_serializable,
+                          realm_instance_from_serializable),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
       DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
new file mode 100644
index 0000000000..f2d42a96ca
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
@@ -0,0 +1,15 @@
+#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
+
+namespace FlexFlow {
+
+SerializableRealmInstance
+    realm_instance_to_serializable(Realm::RegionInstance const &inst) {
+  return SerializableRealmInstance{inst.id};
+}
+
+Realm::RegionInstance
+    realm_instance_from_serializable(SerializableRealmInstance const &inst) {
+  return Realm::RegionInstance{inst.id};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
index 53c2a2b271..dea51d8c92 100644
--- a/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
+++ b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/tensor_instance_backing.h"
+#include "utils/containers/values.h"
 
 namespace FlexFlow {
 
@@ -8,4 +9,17 @@ TensorInstanceBacking make_empty_tensor_instance_backing() {
   };
 }
 
+TensorInstanceBacking subset_tensor_instance_backing_for_invocation(
+    TensorInstanceBacking const &backing,
+    DynamicNodeInvocation const &invocation) {
+  TensorInstanceBacking result = make_empty_tensor_instance_backing();
+  for (DynamicValueAttrs const &value : values(invocation.inputs)) {
+    result.backing.insert(std::pair{value, backing.backing.at(value)});
+  }
+  for (DynamicValueAttrs const &value : values(invocation.outputs)) {
+    result.backing.insert(std::pair{value, backing.backing.at(value)});
+  }
+  return result;
+}
+
 } // namespace FlexFlow

From be57504c3230bc918b6e97f6a2c17f23666e2383 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 14:42:49 -0800
Subject: [PATCH 068/113] Refactor and finish tensor instance backing.

---
 .../dynamic_tensor_accessor_from_instance.h   | 10 ++++--
 .../tasks/impl/device_state_init_task.h       |  6 ++--
 .../impl/device_state_init_task_args.dtg.toml |  4 +--
 .../realm-execution/tasks/impl/op_task.h      | 22 ++++++------
 .../tasks/impl/op_task_args.dtg.toml          |  5 ++-
 ...zable_device_state_init_task_args.dtg.toml |  4 +--
 .../impl/serializable_op_task_args.dtg.toml   |  4 +--
 .../serializable_realm_event.dtg.toml         | 17 +++++++++
 .../serializer/serializable_realm_event.h     | 14 ++++++++
 ...ializable_tensor_instance_backing.dtg.toml | 26 ++++++++++++++
 .../serializable_tensor_instance_backing.h    | 16 +++++++++
 ...distributed_device_state_initialization.cc | 12 +++----
 .../dynamic_tensor_accessor_from_instance.cc  | 36 +++++++++++++++++--
 .../pcg_instance/pcg_instance.cc              | 11 ++----
 .../tasks/impl/device_state_init_task.cc      | 10 ++++--
 .../src/realm-execution/tasks/impl/op_task.cc | 29 ++++++++-------
 ...erializable_device_state_init_task_args.cc | 12 ++-----
 .../tasks/impl/serializable_op_task_args.cc   | 12 ++-----
 .../serializer/serializable_realm_event.cc    | 14 ++++++++
 .../serializable_tensor_instance_backing.cc   | 32 +++++++++++++++++
 20 files changed, 218 insertions(+), 78 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc
 create mode 100644 lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc

diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
index 48cfbde924..8c8ccf6ac4 100644
--- a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
+++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
@@ -1,13 +1,19 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H
 
+#include "op-attrs/parallel_tensor_shape.dtg.h"
 #include "realm-execution/realm.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/permissions.h"
 
 namespace FlexFlow {
 
-DynamicTensorAccessor
-    dynamic_tensor_accessor_from_instance(Realm::RegionInstance const &);
+DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
+    Realm::RegionInstance inst,
+    Realm::Event ready,
+    ParallelTensorShape const &parallel_tensor_shape,
+    Permissions const &permissions,
+    Realm::Processor for_processor);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
index 9c53748916..54bddc1ddd 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -6,11 +6,10 @@
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/device_specific_per_device_op_state.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
-#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
-#include <unordered_map>
 
 namespace FlexFlow {
 
@@ -21,8 +20,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     DynamicNodeInvocation const &invocation,
-    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
-        &tensor_backing,
+    TensorInstanceBacking const &tensor_backing,
     ProfilingSettings const &profiling_settings,
     DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
     FFIterationConfig const &iteration_config,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
index 888c62af54..fbec9298dd 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
@@ -7,10 +7,10 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/tensor_instance_backing.dtg.h",
   "realm-execution/realm.h",
   "task-spec/device_specific_per_device_op_state.dtg.h",
   "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
-  "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
 ]
 
@@ -20,7 +20,7 @@ type = "::FlexFlow::DynamicNodeInvocation"
 
 [[fields]]
 name = "tensor_backing"
-type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, ::FlexFlow::Realm::RegionInstance>"
+type = "TensorInstanceBacking"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 37a801a508..330da4d2b2 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -7,6 +7,7 @@
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
 
@@ -14,17 +15,16 @@ namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_op_task(
-    RealmContext &ctx,
-    Realm::Processor target_proc,
-    DynamicNodeInvocation const &invocation,
-    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
-        &tensor_backing,
-    ProfilingSettings const &profiling_settings,
-    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-    FFIterationConfig const &iteration_config,
-    std::optional<OptimizerAttrs> const &optimizer_attrs,
-    Realm::Event precondition);
+Realm::Event
+    spawn_op_task(RealmContext &ctx,
+                  Realm::Processor target_proc,
+                  DynamicNodeInvocation const &invocation,
+                  TensorInstanceBacking const &tensor_backing,
+                  ProfilingSettings const &profiling_settings,
+                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+                  FFIterationConfig const &iteration_config,
+                  std::optional<OptimizerAttrs> const &optimizer_attrs,
+                  Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
index 84fa384d25..2a55ffbf80 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -7,9 +7,8 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/device_specific_managed_per_device_ff_handle.h",
-  "realm-execution/realm.h",
+  "realm-execution/tensor_instance_backing.dtg.h",
   "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
-  "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
 ]
 
@@ -19,7 +18,7 @@ type = "::FlexFlow::DynamicNodeInvocation"
 
 [[fields]]
 name = "tensor_backing"
-type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, ::FlexFlow::Realm::RegionInstance>"
+type = "TensorInstanceBacking"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
index f3847c9137..034132f9d1 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
@@ -12,8 +12,8 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
-  "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h",
   "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
+  "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h",
   "task-spec/device_specific_per_device_op_state.dtg.h",
   "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
@@ -25,7 +25,7 @@ type = "::FlexFlow::SerializableDynamicNodeInvocation"
 
 [[fields]]
 name = "tensor_backing"
-type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, ::FlexFlow::SerializableRealmInstance>"
+type = "::FlexFlow::SerializableTensorInstanceBacking"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
index 3ca338689a..ac31e78d0d 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
@@ -12,7 +12,7 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
-  "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h",
+  "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h",
   "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
 ]
@@ -28,7 +28,7 @@ type = "::FlexFlow::SerializableDynamicNodeInvocation"
 
 [[fields]]
 name = "tensor_backing"
-type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, ::FlexFlow::SerializableRealmInstance>"
+type = "::FlexFlow::SerializableTensorInstanceBacking"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml
new file mode 100644
index 0000000000..3217d58608
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SerializableRealmEvent"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "realm-execution/realm.h",
+]
+
+[[fields]]
+name = "id"
+type = "::FlexFlow::Realm::Event::id_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h
new file mode 100644
index 0000000000..ae1f1e8265
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_EVENT_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_EVENT_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/serializer/serializable_realm_event.dtg.h"
+
+namespace FlexFlow {
+
+SerializableRealmEvent realm_event_to_serializable(Realm::Event const &);
+Realm::Event realm_event_from_serializable(SerializableRealmEvent const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml
new file mode 100644
index 0000000000..75a796b2ee
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "SerializableTensorInstanceBacking"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "<unordered_map>",
+  "realm-execution/tasks/serializer/serializable_realm_event.dtg.h",
+  "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h",
+]
+
+src_includes = [
+  "utils/hash/unordered_map.h",
+  "utils/fmt/pair.h",
+  "utils/fmt/unordered_map.h",
+]
+
+[[fields]]
+name = "backing"
+type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, std::pair<::FlexFlow::SerializableRealmInstance, ::FlexFlow::SerializableRealmEvent>>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h
new file mode 100644
index 0000000000..b536972b40
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_TENSOR_INSTANCE_BACKING_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_TENSOR_INSTANCE_BACKING_H
+
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+
+namespace FlexFlow {
+
+SerializableTensorInstanceBacking
+    tensor_instance_backing_to_serializable(TensorInstanceBacking const &);
+TensorInstanceBacking tensor_instance_backing_from_serializable(
+    SerializableTensorInstanceBacking const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index de8060aa12..d2d876a50b 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -1,6 +1,7 @@
 #include "realm-execution/distributed_device_state_initialization.h"
 #include "local-execution/device_state_initialization.h"
 #include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
 #include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
@@ -31,14 +32,9 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
     Realm::Processor target_proc = ctx.map_device_coord_to_processor(
         assert_unwrap(invocation.node_attrs.device_coord));
 
-    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance>
-        tensor_backing = map_values(
-            subset_tensor_instance_backing_for_invocation(
-                tensor_instance_backing, invocation)
-                .backing,
-            [](std::pair<Realm::RegionInstance, Realm::Event> const &v) {
-              return v.first;
-            });
+    TensorInstanceBacking tensor_backing =
+        subset_tensor_instance_backing_for_invocation(tensor_instance_backing,
+                                                      invocation);
 
     // FIXME: in the absense of a real serializer we're just tossing around raw
     // bytes, which means we need to bypass the constructor for this type (yes,
diff --git a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
index cb9382cfe0..d1c773b1fa 100644
--- a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
+++ b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
@@ -1,11 +1,41 @@
 #include "realm-execution/dynamic_tensor_accessor_from_instance.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/device_type.dtg.h"
+#include "task-spec/permissions.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
 
-DynamicTensorAccessor
-    dynamic_tensor_accessor_from_instance(Realm::RegionInstance const &) {
-  NOT_IMPLEMENTED();
+DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
+    Realm::RegionInstance inst,
+    Realm::Event ready,
+    ParallelTensorShape const &parallel_tensor_shape,
+    Permissions const &permissions,
+    Realm::Processor for_processor) {
+  ready.wait();
+
+  DeviceType device_type;
+  switch (for_processor.kind()) {
+    case Realm::Processor::LOC_PROC:
+      device_type = DeviceType::CPU;
+      break;
+    case Realm::Processor::TOC_PROC:
+      device_type = DeviceType::GPU;
+      break;
+    default:
+      PANIC("Unexpected Realm Processor kind", for_processor.kind());
+  }
+
+  size_t expected_size =
+      int{get_piece_size_in_bytes(parallel_tensor_shape).unwrap_num_bytes()};
+  void *ptr = inst.pointer_untyped(/*offset=*/0, /*datalen=*/expected_size);
+  if (permissions == Permissions::RO) {
+    return DynamicTensorAccessor{GenericTensorAccessorR{
+        get_piece_shape(parallel_tensor_shape), ptr, device_type}};
+  } else {
+    return DynamicTensorAccessor{GenericTensorAccessorW{
+        get_piece_shape(parallel_tensor_shape), ptr, device_type}};
+  }
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 496c3210c0..8390d12cc9 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -182,14 +182,9 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         Realm::Processor target_proc = ctx.map_device_coord_to_processor(
             assert_unwrap(invocation.node_attrs.device_coord));
 
-        std::unordered_map<DynamicValueAttrs, Realm::RegionInstance>
-            tensor_backing = map_values(
-                subset_tensor_instance_backing_for_invocation(
-                    tensor_instance_backing, invocation)
-                    .backing,
-                [](std::pair<Realm::RegionInstance, Realm::Event> const &v) {
-                  return v.first;
-                });
+        TensorInstanceBacking tensor_backing =
+            subset_tensor_instance_backing_for_invocation(
+                tensor_instance_backing, invocation);
 
         Realm::Event result = spawn_op_task(ctx,
                                             target_proc,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index d455b493da..7f3f2d185c 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -34,8 +34,13 @@ void device_state_init_task_body(void const *args,
   // Patch the invocation to include the provided instances
   auto map_instance_to_accessor = [&](DynamicValueAttrs const &value) {
     DynamicValueAttrs result = value;
+    auto const &[inst, event] = task_args.tensor_backing.backing.at(value);
     result.accessor = dynamic_tensor_accessor_from_instance(
-        task_args.tensor_backing.at(value));
+        inst,
+        event,
+        assert_unwrap(value.parallel_tensor_shape),
+        Permissions::RW, // FIXME: get real permissions?
+        ctx.get_current_processor());
     return result;
   };
   DynamicNodeInvocation invocation = task_args.invocation;
@@ -67,8 +72,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     DynamicNodeInvocation const &invocation,
-    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
-        &tensor_backing,
+    TensorInstanceBacking const &tensor_backing,
     ProfilingSettings const &profiling_settings,
     DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
     FFIterationConfig const &iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index 0f65b808aa..dc262bbdb1 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -7,6 +7,7 @@
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/per_device_op_state.h"
+#include "task-spec/permissions.h"
 #include "utils/containers/map_values.h"
 #include "utils/optional.h"
 #include <type_traits>
@@ -29,8 +30,13 @@ void op_task_body(void const *args,
   // Patch the invocation to include the provided instances
   auto map_instance_to_accessor = [&](DynamicValueAttrs const &value) {
     DynamicValueAttrs result = value;
+    auto const &[inst, event] = task_args.tensor_backing.backing.at(value);
     result.accessor = dynamic_tensor_accessor_from_instance(
-        task_args.tensor_backing.at(value));
+        inst,
+        event,
+        assert_unwrap(value.parallel_tensor_shape),
+        Permissions::RW, // FIXME: get real permissions?
+        ctx.get_current_processor());
     return result;
   };
   DynamicNodeInvocation invocation = task_args.invocation;
@@ -53,17 +59,16 @@ void op_task_body(void const *args,
       /*device_idx=*/ctx.get_current_device_idx());
 }
 
-Realm::Event spawn_op_task(
-    RealmContext &ctx,
-    Realm::Processor target_proc,
-    DynamicNodeInvocation const &invocation,
-    std::unordered_map<DynamicValueAttrs, Realm::RegionInstance> const
-        &tensor_backing,
-    ProfilingSettings const &profiling_settings,
-    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-    FFIterationConfig const &iteration_config,
-    std::optional<OptimizerAttrs> const &optimizer_attrs,
-    Realm::Event precondition) {
+Realm::Event
+    spawn_op_task(RealmContext &ctx,
+                  Realm::Processor target_proc,
+                  DynamicNodeInvocation const &invocation,
+                  TensorInstanceBacking const &tensor_backing,
+                  ProfilingSettings const &profiling_settings,
+                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+                  FFIterationConfig const &iteration_config,
+                  std::optional<OptimizerAttrs> const &optimizer_attrs,
+                  Realm::Event precondition) {
   OpTaskArgs task_args{invocation,
                        tensor_backing,
                        profiling_settings,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
index 59a1dd71a6..64669b9f1e 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
@@ -1,9 +1,7 @@
 #include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
-#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
 #include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
-#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
-#include "utils/containers/map_keys_and_values.h"
 
 namespace FlexFlow {
 
@@ -12,9 +10,7 @@ SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
   return SerializableDeviceStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
       /*tensor_backing*/
-      map_keys_and_values(args.tensor_backing,
-                          dynamic_value_attrs_to_serializable,
-                          realm_instance_to_serializable),
+      tensor_instance_backing_to_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/args.device_handle.serialize(),
       /*iteration_config=*/args.iteration_config,
@@ -29,9 +25,7 @@ DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
   return DeviceStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
       /*tensor_backing*/
-      map_keys_and_values(args.tensor_backing,
-                          dynamic_value_attrs_from_serializable,
-                          realm_instance_from_serializable),
+      tensor_instance_backing_from_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
       DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
index 04a213e906..0ef2fb0442 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -1,8 +1,6 @@
 #include "realm-execution/tasks/impl/serializable_op_task_args.h"
-#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
-#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
-#include "utils/containers/map_keys_and_values.h"
 
 namespace FlexFlow {
 
@@ -10,9 +8,7 @@ SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
   return SerializableOpTaskArgs{
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
       /*tensor_backing*/
-      map_keys_and_values(args.tensor_backing,
-                          dynamic_value_attrs_to_serializable,
-                          realm_instance_to_serializable),
+      tensor_instance_backing_to_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/args.device_handle.serialize(),
       /*iteration_config=*/args.iteration_config,
@@ -24,9 +20,7 @@ OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
   return OpTaskArgs{
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
       /*tensor_backing*/
-      map_keys_and_values(args.tensor_backing,
-                          dynamic_value_attrs_from_serializable,
-                          realm_instance_from_serializable),
+      tensor_instance_backing_from_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
       DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc
new file mode 100644
index 0000000000..806059f3ed
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc
@@ -0,0 +1,14 @@
+#include "realm-execution/tasks/serializer/serializable_realm_event.h"
+
+namespace FlexFlow {
+
+SerializableRealmEvent realm_event_to_serializable(Realm::Event const &event) {
+  return SerializableRealmEvent{event.id};
+}
+
+Realm::Event
+    realm_event_from_serializable(SerializableRealmEvent const &event) {
+  return Realm::Event{event.id};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc
new file mode 100644
index 0000000000..79a5176c4f
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc
@@ -0,0 +1,32 @@
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
+#include "realm-execution/tasks/serializer/serializable_realm_event.h"
+#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include "utils/containers/map_keys_and_values.h"
+
+namespace FlexFlow {
+
+SerializableTensorInstanceBacking tensor_instance_backing_to_serializable(
+    TensorInstanceBacking const &backing) {
+  return SerializableTensorInstanceBacking{/*backing=*/map_keys_and_values(
+      backing.backing,
+      dynamic_value_attrs_to_serializable,
+      [](std::pair<Realm::RegionInstance, Realm::Event> const &p) {
+        return std::pair{realm_instance_to_serializable(p.first),
+                         realm_event_to_serializable(p.second)};
+      })};
+}
+
+TensorInstanceBacking tensor_instance_backing_from_serializable(
+    SerializableTensorInstanceBacking const &backing) {
+  return TensorInstanceBacking{/*backing=*/map_keys_and_values(
+      backing.backing,
+      dynamic_value_attrs_from_serializable,
+      [](std::pair<SerializableRealmInstance, SerializableRealmEvent> const
+             &p) {
+        return std::pair{realm_instance_from_serializable(p.first),
+                         realm_event_from_serializable(p.second)};
+      })};
+}
+
+} // namespace FlexFlow

From 1553644207e461e568d9895ac9feb8c05d52b38d Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 15:36:42 -0800
Subject: [PATCH 069/113] Don't execute tasks on input or weight nodes.

---
 .../src/realm-execution/pcg_instance/pcg_instance.cc     | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 8390d12cc9..2287b9d54b 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -16,6 +16,7 @@
 #include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
 #include "task-spec/dynamic_graph/pass_expansion.h"
 #include "task-spec/dynamic_graph/shard_expansion.h"
+#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
 #include "task-spec/dynamic_graph/update_insertion.h"
 #include "utils/containers/map_values.h"
 #include "utils/containers/transform.h"
@@ -166,6 +167,14 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
   DependencySet dependency_set{ctx.get_outstanding_events()};
   return unordered_map_from_pairs(
       transform(invocations, [&](DynamicNodeInvocation const &invocation) {
+        TrainingOperationAttrs op_attrs =
+            assert_unwrap(invocation.node_attrs.op_attrs);
+        if (op_attrs.is_pcg_op() && (op_attrs.require_pcg_op().is_input() ||
+                                     op_attrs.require_pcg_op().is_weight())) {
+          return std::pair{invocation.node_attrs.layer_guid,
+                           Realm::Event::NO_EVENT};
+        }
+
         std::vector<Realm::Event> input_dependencies =
             transform(vector_of(values(invocation.inputs)),
                       [&](DynamicValueAttrs const &value) {

From 38ea732a2818333960861572ad9cc766d7b13df5 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 16:03:19 -0800
Subject: [PATCH 070/113] Refactor device specific managed handle.

---
 ...ce_specific_managed_per_device_ff_handle.h | 21 ++---------
 .../realm-execution/device_specific_ptr.h     | 36 +++++++++++++++++++
 .../serializable_device_specific_ptr.h        | 32 +++++++++++++++++
 ...e_specific_managed_per_device_ff_handle.cc | 35 ------------------
 ...erializable_device_state_init_task_args.cc |  6 ++--
 .../tasks/impl/serializable_op_task_args.cc   |  6 ++--
 6 files changed, 79 insertions(+), 57 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/device_specific_ptr.h
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h

diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
index d48a80f438..9a42861fcd 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -4,28 +4,13 @@
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "pcg/device_id_t.dtg.h"
-#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h"
-#include <nlohmann/json.hpp>
+#include "realm-execution/device_specific_ptr.h"
 #include <optional>
 
 namespace FlexFlow {
 
-struct DeviceSpecificManagedPerDeviceFFHandle {
-public:
-  DeviceSpecificManagedPerDeviceFFHandle() = delete;
-  explicit DeviceSpecificManagedPerDeviceFFHandle(
-      device_id_t owner, std::optional<ManagedPerDeviceFFHandle *> handle);
-
-  std::optional<ManagedPerDeviceFFHandle *> get(device_id_t device_idx) const;
-
-  SerializableDeviceSpecificPtr serialize() const;
-  static DeviceSpecificManagedPerDeviceFFHandle
-      deserialize(SerializableDeviceSpecificPtr const &j);
-
-private:
-  device_id_t owner;
-  std::optional<ManagedPerDeviceFFHandle *> handle;
-};
+using DeviceSpecificManagedPerDeviceFFHandle =
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle>;
 
 DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
     device_id_t const &, std::optional<ManagedPerDeviceFFHandle *> const &);
diff --git a/lib/realm-execution/include/realm-execution/device_specific_ptr.h b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
new file mode 100644
index 0000000000..81d41131b7
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
@@ -0,0 +1,36 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_PTR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_PTR_H
+
+#include "pcg/device_id_t.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+
+template <typename T>
+struct DeviceSpecificPtr {
+public:
+  DeviceSpecificPtr() = delete;
+  explicit DeviceSpecificPtr(device_id_t device_idx, std::optional<T *> handle)
+      : device_idx(device_idx), ptr(ptr) {}
+
+  std::optional<T *> get(device_id_t device_idx) const {
+    ASSERT(this->device_idx == device_idx);
+    return this->ptr;
+  }
+
+  device_id_t get_device_idx() const {
+    return this->device_idx;
+  }
+
+  std::optional<T *> get_unsafe_raw_ptr() const {
+    return this->ptr;
+  }
+
+private:
+  device_id_t device_idx;
+  std::optional<T *> ptr;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
new file mode 100644
index 0000000000..726aef84ba
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_DEVICE_SPECIFIC_PTR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_DEVICE_SPECIFIC_PTR_H
+
+#include "realm-execution/device_specific_ptr.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h"
+
+namespace FlexFlow {
+
+template <typename T>
+SerializableDeviceSpecificPtr device_specific_ptr_to_serializable(
+    DeviceSpecificPtr<T> const &device_specific) {
+  return SerializableDeviceSpecificPtr{
+      /*device_idx=*/device_specific.get_device_idx(),
+      /*ptr=*/
+      transform(device_specific.get_unsafe_raw_ptr(),
+                [](T *ptr) { return reinterpret_cast<uintptr_t>(ptr); }),
+  };
+}
+
+template <typename T>
+DeviceSpecificPtr<T> device_specific_ptr_from_serializable(
+    SerializableDeviceSpecificPtr const &device_specific) {
+  return DeviceSpecificPtr<T>{
+      /*device_idx*/ device_specific.device_idx,
+      /*ptr=*/transform(device_specific.ptr, [](uintptr_t ptrval) {
+        return reinterpret_cast<T *>(ptrval);
+      })};
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index bcc0a22ccf..ae9fc669d3 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -6,41 +6,6 @@
 
 namespace FlexFlow {
 
-DeviceSpecificManagedPerDeviceFFHandle::DeviceSpecificManagedPerDeviceFFHandle(
-    device_id_t owner, std::optional<ManagedPerDeviceFFHandle *> handle)
-    : owner(owner), handle(handle) {}
-
-std::optional<ManagedPerDeviceFFHandle *>
-    DeviceSpecificManagedPerDeviceFFHandle::get(device_id_t device_idx) const {
-  ASSERT(this->owner == device_idx);
-  return this->handle;
-}
-
-SerializableDeviceSpecificPtr
-    DeviceSpecificManagedPerDeviceFFHandle::serialize() const {
-  return SerializableDeviceSpecificPtr{
-      /*device_idx=*/owner,
-      /*ptr=*/
-      transform(handle,
-                [](ManagedPerDeviceFFHandle *ptr) {
-                  return reinterpret_cast<uintptr_t>(ptr);
-                }),
-  };
-}
-
-DeviceSpecificManagedPerDeviceFFHandle
-    DeviceSpecificManagedPerDeviceFFHandle::deserialize(
-        SerializableDeviceSpecificPtr const &handle) {
-  return DeviceSpecificManagedPerDeviceFFHandle{
-      /*owner=*/handle.device_idx,
-      /*handle=*/
-      transform(handle.ptr,
-                [](uintptr_t ptrval) {
-                  return reinterpret_cast<ManagedPerDeviceFFHandle *>(ptrval);
-                }),
-  };
-}
-
 DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
     device_id_t const &device_id,
     std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
index 64669b9f1e..fed22ff393 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
 #include "realm-execution/tasks/serializer/serializable_realm_processor.h"
 #include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
@@ -12,7 +13,7 @@ SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
       /*tensor_backing*/
       tensor_instance_backing_to_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
-      /*device_handle=*/args.device_handle.serialize(),
+      /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle),
       /*iteration_config=*/args.iteration_config,
       /*optimizer_attrs=*/args.optimizer_attrs,
       /*origin_proc=*/realm_processor_to_serializable(args.origin_proc),
@@ -28,7 +29,8 @@ DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
       tensor_instance_backing_from_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
-      DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
+      device_specific_ptr_from_serializable<ManagedPerDeviceFFHandle>(
+          args.device_handle),
       /*iteration_config=*/args.iteration_config,
       /*optimizer_attrs=*/args.optimizer_attrs,
       /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
index 0ef2fb0442..80994d4298 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -1,4 +1,5 @@
 #include "realm-execution/tasks/impl/serializable_op_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
 #include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
 
@@ -10,7 +11,7 @@ SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
       /*tensor_backing*/
       tensor_instance_backing_to_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
-      /*device_handle=*/args.device_handle.serialize(),
+      /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle),
       /*iteration_config=*/args.iteration_config,
       /*optimizer_attrs=*/args.optimizer_attrs,
   };
@@ -23,7 +24,8 @@ OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
       tensor_instance_backing_from_serializable(args.tensor_backing),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
-      DeviceSpecificManagedPerDeviceFFHandle::deserialize(args.device_handle),
+      device_specific_ptr_from_serializable<ManagedPerDeviceFFHandle>(
+          args.device_handle),
       /*iteration_config=*/args.iteration_config,
       /*optimizer_attrs=*/args.optimizer_attrs,
   };

From 9f94f729f20e4d22ed190dda5f4865f6e9b85cca Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 17:26:02 -0800
Subject: [PATCH 071/113] Refactor per-device op state backing.

---
 .../realm-execution/device_specific_ptr.h     |  2 +-
 .../distributed_device_state_initialization.h |  7 ++-
 .../pcg_instance/pcg_instance.h               |  4 ++
 .../per_device_op_state_backing.dtg.toml      | 15 +++++
 .../impl/device_state_init_return_task.h      |  7 ++-
 .../tasks/impl/device_state_init_task.h       |  5 +-
 .../impl/device_state_init_task_args.dtg.toml |  3 +-
 .../realm-execution/tasks/impl/op_task.h      |  3 +
 .../tasks/impl/op_task_args.dtg.toml          |  8 ++-
 .../impl/serializable_op_task_args.dtg.toml   |  4 ++
 ...distributed_device_state_initialization.cc | 62 +++++++------------
 .../pcg_instance/pcg_instance.cc              | 61 ++++++++++--------
 .../impl/device_state_init_return_task.cc     | 12 ++--
 .../tasks/impl/device_state_init_task.cc      | 13 ++--
 .../src/realm-execution/tasks/impl/op_task.cc | 11 ++--
 ...erializable_device_state_init_task_args.cc |  2 +-
 .../tasks/impl/serializable_op_task_args.cc   |  4 ++
 17 files changed, 131 insertions(+), 92 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml

diff --git a/lib/realm-execution/include/realm-execution/device_specific_ptr.h b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
index 81d41131b7..590b7dbc74 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_ptr.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
@@ -10,7 +10,7 @@ template <typename T>
 struct DeviceSpecificPtr {
 public:
   DeviceSpecificPtr() = delete;
-  explicit DeviceSpecificPtr(device_id_t device_idx, std::optional<T *> handle)
+  explicit DeviceSpecificPtr(device_id_t device_idx, std::optional<T *> ptr)
       : device_idx(device_idx), ptr(ptr) {}
 
   std::optional<T *> get(device_id_t device_idx) const {
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
index e257834e65..b26a69078e 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -1,9 +1,10 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_STATE_INITIALIZATION_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_STATE_INITIALIZATION_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_BACKING_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_BACKING_H
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/per_device_op_state_backing.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
@@ -11,7 +12,7 @@
 
 namespace FlexFlow {
 
-DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
+PerDeviceOpStateBacking perform_distributed_device_state_initialization(
     RealmContext &ctx,
     DynamicOpenDataflowGraph const &dg,
     TensorInstanceBacking const &tensor_instance_backing,
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index 1238097b2a..e754fbbf5c 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -11,6 +11,7 @@
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/per_device_op_state_backing.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
@@ -31,11 +32,13 @@ struct PCGInstance {
       RealmContext &ctx,
       std::vector<DynamicNodeInvocation> const &execution_order,
       TensorInstanceBacking const &tensor_instance_backing,
+      PerDeviceOpStateBacking const &device_state_backing,
       OptimizerAttrs const &optimizer_attrs,
       std::optional<Realm::RegionInstance> logit_grad_tensor);
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   TensorInstanceBacking const &get_tensor_instance_backing() const;
+  PerDeviceOpStateBacking const &get_device_state_backing() const;
   OptimizerAttrs const &get_optimizer_attrs() const;
   void update_optimizer_attrs_for_next_iter();
   std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
@@ -44,6 +47,7 @@ struct PCGInstance {
   RealmContext &ctx;
   std::vector<DynamicNodeInvocation> execution_order;
   TensorInstanceBacking tensor_instance_backing;
+  PerDeviceOpStateBacking device_state_backing;
   OptimizerAttrs optimizer_attrs;
   std::optional<Realm::RegionInstance> logit_grad_tensor;
 };
diff --git a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
new file mode 100644
index 0000000000..90a9d01e69
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
@@ -0,0 +1,15 @@
+namespace = "FlexFlow"
+name = "PerDeviceOpStateBacking"
+type = "struct"
+features = []
+
+includes = [
+  "<unordered_map>",
+  "realm-execution/device_specific_ptr.h",
+  "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/per_device_op_state.dtg.h",
+]
+
+[[fields]]
+name = "backing"
+type = "std::unordered_map<::FlexFlow::DynamicNodeInvocation, ::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
index 8f44680815..4de7e5689f 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
@@ -1,9 +1,10 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
 
+#include "realm-execution/device_specific_ptr.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
-#include "task-spec/device_specific_per_device_op_state.dtg.h"
+#include "task-spec/per_device_op_state.dtg.h"
 
 namespace FlexFlow {
 
@@ -13,8 +14,8 @@ void device_state_init_return_task_body(
 Realm::Event spawn_device_state_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
-    DeviceSpecificPerDeviceOpState const &result,
-    DeviceSpecificPerDeviceOpState *origin_result_ptr,
+    DeviceSpecificPtr<PerDeviceOpState> const &result,
+    DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr,
     Realm::Event precondition);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
index 54bddc1ddd..657d2e8401 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -4,12 +4,13 @@
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_ptr.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
-#include "task-spec/device_specific_per_device_op_state.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
+#include "task-spec/per_device_op_state.dtg.h"
 
 namespace FlexFlow {
 
@@ -25,7 +26,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
-    DeviceSpecificPerDeviceOpState *result_ptr,
+    DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
     Realm::Event precondition);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
index fbec9298dd..9a7c2781d2 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
@@ -12,6 +12,7 @@ includes = [
   "task-spec/device_specific_per_device_op_state.dtg.h",
   "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
+  "task-spec/per_device_op_state.dtg.h",
 ]
 
 [[fields]]
@@ -44,4 +45,4 @@ type = "::FlexFlow::Realm::Processor"
 
 [[fields]]
 name = "origin_result_ptr"
-type = "::FlexFlow::DeviceSpecificPerDeviceOpState *"
+type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState> *"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 330da4d2b2..33dcbff895 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -5,11 +5,13 @@
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_ptr.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
+#include "task-spec/per_device_op_state.dtg.h"
 
 namespace FlexFlow {
 
@@ -20,6 +22,7 @@ Realm::Event
                   Realm::Processor target_proc,
                   DynamicNodeInvocation const &invocation,
                   TensorInstanceBacking const &tensor_backing,
+                  DeviceSpecificPtr<PerDeviceOpState> const &device_state,
                   ProfilingSettings const &profiling_settings,
                   DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
                   FFIterationConfig const &iteration_config,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
index 2a55ffbf80..a15c8dce11 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -7,9 +7,11 @@ includes = [
   "kernels/profiling_settings.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
   "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/device_specific_ptr.h",
   "realm-execution/tensor_instance_backing.dtg.h",
   "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
   "task-spec/ff_iteration_config.dtg.h",
+  "task-spec/per_device_op_state.dtg.h",
 ]
 
 [[fields]]
@@ -18,7 +20,11 @@ type = "::FlexFlow::DynamicNodeInvocation"
 
 [[fields]]
 name = "tensor_backing"
-type = "TensorInstanceBacking"
+type = "::FlexFlow::TensorInstanceBacking"
+
+[[fields]]
+name = "device_state"
+type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
index ac31e78d0d..2be0034c46 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
@@ -30,6 +30,10 @@ type = "::FlexFlow::SerializableDynamicNodeInvocation"
 name = "tensor_backing"
 type = "::FlexFlow::SerializableTensorInstanceBacking"
 
+[[fields]]
+name = "device_state"
+type = "::FlexFlow::SerializableDeviceSpecificPtr"
+
 [[fields]]
 name = "profiling_settings"
 type = "::FlexFlow::ProfilingSettings"
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index d2d876a50b..c6d8ea3e69 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -13,7 +13,7 @@
 
 namespace FlexFlow {
 
-DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
+PerDeviceOpStateBacking perform_distributed_device_state_initialization(
     RealmContext &ctx,
     DynamicOpenDataflowGraph const &dg,
     TensorInstanceBacking const &tensor_instance_backing,
@@ -26,8 +26,16 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
   // Initialize all operators and save the per-device op state
   ASSERT(no_nodes_are_initialized(dg));
 
-  std::unordered_map<DynamicNodeInvocation, DeviceSpecificPerDeviceOpState *>
-      result_map;
+  std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
+      result;
+
+  // Preallocate output before launching tasks
+  for (DynamicNodeInvocation const &invocation : dg.invocations) {
+    result.insert(std::pair{invocation,
+                            DeviceSpecificPtr<PerDeviceOpState>{
+                                ctx.get_current_device_idx(), std::nullopt}});
+  }
+
   for (DynamicNodeInvocation const &invocation : dg.invocations) {
     Realm::Processor target_proc = ctx.map_device_coord_to_processor(
         assert_unwrap(invocation.node_attrs.device_coord));
@@ -36,47 +44,21 @@ DynamicOpenDataflowGraph perform_distributed_device_state_initialization(
         subset_tensor_instance_backing_for_invocation(tensor_instance_backing,
                                                       invocation);
 
-    // FIXME: in the absense of a real serializer we're just tossing around raw
-    // bytes, which means we need to bypass the constructor for this type (yes,
-    // ugh)
-    DeviceSpecificPerDeviceOpState *output =
-        static_cast<DeviceSpecificPerDeviceOpState *>(
-            malloc(sizeof(DeviceSpecificPerDeviceOpState)));
-    std::optional<Realm::Event> result =
-        spawn_device_state_init_task(ctx,
-                                     target_proc,
-                                     invocation,
-                                     tensor_backing,
-                                     profiling_settings,
-                                     device_handle.at(target_proc),
-                                     iteration_config,
-                                     optimizer_attrs,
-                                     output,
-                                     precondition);
-    if (result) {
-      result_map[invocation] = output;
-    } else {
-      free(output);
-    }
+    spawn_device_state_init_task(ctx,
+                                 target_proc,
+                                 invocation,
+                                 tensor_backing,
+                                 profiling_settings,
+                                 device_handle.at(target_proc),
+                                 iteration_config,
+                                 optimizer_attrs,
+                                 &result.at(invocation),
+                                 precondition);
   }
 
   ctx.get_outstanding_events().wait();
 
-  DynamicOpenDataflowGraph result = transform_dynamic_invocation_set(
-      dg, [&](DynamicNodeInvocation const &invocation) {
-        DynamicNodeInvocation result = invocation;
-        auto device_state = result_map.find(invocation);
-        if (device_state != result_map.end()) {
-          result.node_attrs.per_device_op_state = *device_state->second;
-        }
-        return result;
-      });
-
-  for (auto &[invocation, output] : result_map) {
-    free(output);
-  }
-
-  return result;
+  return PerDeviceOpStateBacking{/*backing=*/result};
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 2287b9d54b..5d1a63ba5b 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -30,10 +30,12 @@ PCGInstance::PCGInstance(
     RealmContext &ctx,
     std::vector<DynamicNodeInvocation> const &execution_order,
     TensorInstanceBacking const &tensor_instance_backing,
+    PerDeviceOpStateBacking const &device_state_backing,
     OptimizerAttrs const &optimizer_attrs,
     std::optional<Realm::RegionInstance> logit_grad_tensor)
     : ctx(ctx), execution_order(execution_order),
       tensor_instance_backing(tensor_instance_backing),
+      device_state_backing(device_state_backing),
       optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {}
 
 RealmContext &PCGInstance::get_realm_context() {
@@ -46,6 +48,9 @@ std::vector<DynamicNodeInvocation> const &
 TensorInstanceBacking const &PCGInstance::get_tensor_instance_backing() const {
   return this->tensor_instance_backing;
 }
+PerDeviceOpStateBacking const &PCGInstance::get_device_state_backing() const {
+  return this->device_state_backing;
+}
 OptimizerAttrs const &PCGInstance::get_optimizer_attrs() const {
   return this->optimizer_attrs;
 }
@@ -92,7 +97,8 @@ PCGInstance create_pcg_instance(
 
   dg = perform_update_insertion(dg, optimizer_attrs);
   dg = perform_shard_expansion(dg);
-  TensorInstanceBacking backing = perform_instance_allocation(dg, inputs, ctx);
+  TensorInstanceBacking tensor_instance_backing =
+      perform_instance_allocation(dg, inputs, ctx);
 
   logit_grad_value =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
@@ -114,20 +120,19 @@ PCGInstance create_pcg_instance(
 
   std::optional<Realm::RegionInstance> logit_grad_tensor =
       transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
-        return backing.backing.at(lgv).first;
+        return tensor_instance_backing.backing.at(lgv).first;
       });
 
-  // FIXME: for now we're going to be lazy and block on everything rather than
-  // do fine-grained dependencies on instances
-  dg = perform_distributed_device_state_initialization(
-      ctx,
-      dg,
-      backing,
-      profiling_settings,
-      device_handle,
-      iteration_config,
-      optimizer_attrs,
-      ctx.get_outstanding_events());
+  PerDeviceOpStateBacking device_state_backing =
+      perform_distributed_device_state_initialization(
+          ctx,
+          dg,
+          tensor_instance_backing,
+          profiling_settings,
+          device_handle,
+          iteration_config,
+          optimizer_attrs,
+          ctx.get_outstanding_events());
 
   // Compute the topological ordering of the graph
   auto [kwarg_graph, node_map] =
@@ -138,14 +143,13 @@ PCGInstance create_pcg_instance(
 
   return PCGInstance{/*ctx=*/ctx,
                      /*execution_order=*/invocation_topo_order,
-                     /*tensor_instance_backing=*/backing,
+                     /*tensor_instance_backing=*/tensor_instance_backing,
+                     /*device_state_backing=*/device_state_backing,
                      /*optimizer_attrs=*/optimizer_attrs,
                      /*logit_grad_tensor=*/logit_grad_tensor};
 
   // TODO list:
   //  * external instances
-  //  * task argument serializer
-  //  * pass instances to task and convert to tensor accessor
   //  * copies
   //  * parallel operator implementation (partition, reduce, gather, etc.)
   //  * and fused parallel operators (reduce + broadcast = allreduce)
@@ -158,6 +162,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         RealmContext &ctx,
         std::vector<DynamicNodeInvocation> const &invocations,
         TensorInstanceBacking const &tensor_instance_backing,
+        PerDeviceOpStateBacking const &device_state_backing,
         OptimizerAttrs const &optimizer_attrs,
         ProfilingSettings const &profiling_settings,
         DistributedDeviceHandle const &device_handle,
@@ -195,15 +200,17 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
             subset_tensor_instance_backing_for_invocation(
                 tensor_instance_backing, invocation);
 
-        Realm::Event result = spawn_op_task(ctx,
-                                            target_proc,
-                                            invocation,
-                                            tensor_backing,
-                                            profiling_settings,
-                                            device_handle.at(target_proc),
-                                            iteration_config,
-                                            optimizer_attrs,
-                                            dependencies);
+        Realm::Event result =
+            spawn_op_task(ctx,
+                          target_proc,
+                          invocation,
+                          tensor_backing,
+                          device_state_backing.backing.at(invocation),
+                          profiling_settings,
+                          device_handle.at(target_proc),
+                          iteration_config,
+                          optimizer_attrs,
+                          dependencies);
         for (DynamicValueAttrs const &value : values(invocation.inputs)) {
           dependency_set.add_reader(value, result);
         }
@@ -228,6 +235,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
           /*invocations=*/execution_order,
           /*tensor_instance_backing=*/
           pcg_instance.get_tensor_instance_backing(),
+          /*device_state_backing=*/pcg_instance.get_device_state_backing(),
           /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
           /*device_handle=*/device_handle,
@@ -254,6 +262,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
       /*ctx=*/pcg_instance.get_realm_context(),
       /*invocations=*/execution_order,
       /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*device_state_backing=*/pcg_instance.get_device_state_backing(),
       /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
       /*device_handle=*/device_handle,
@@ -278,6 +287,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
       /*ctx=*/pcg_instance.get_realm_context(),
       /*invocations=*/execution_order,
       /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*device_state_backing=*/pcg_instance.get_device_state_backing(),
       /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
       /*profiling_settings=*/profiling_settings,
       /*device_handle=*/device_handle,
@@ -304,6 +314,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
           /*invocations=*/execution_order,
           /*tensor_instance_backing=*/
           pcg_instance.get_tensor_instance_backing(),
+          /*device_state_backing=*/pcg_instance.get_device_state_backing(),
           /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
           /*profiling_settings=*/profiling_settings,
           /*device_handle=*/device_handle,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
index 306697e950..a1a7eb84a8 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
@@ -7,16 +7,16 @@ struct DeviceStateInitReturnTaskArgs {
 public:
   DeviceStateInitReturnTaskArgs() = delete;
   DeviceStateInitReturnTaskArgs(
-      DeviceSpecificPerDeviceOpState result,
+      DeviceSpecificPtr<PerDeviceOpState> result,
       Realm::Processor origin_proc,
-      DeviceSpecificPerDeviceOpState *origin_result_ptr)
+      DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr)
       : result(result), origin_proc(origin_proc),
         origin_result_ptr(origin_result_ptr) {}
 
 public:
-  DeviceSpecificPerDeviceOpState result;
+  DeviceSpecificPtr<PerDeviceOpState> result;
   Realm::Processor origin_proc;
-  DeviceSpecificPerDeviceOpState *origin_result_ptr;
+  DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr;
 };
 
 void device_state_init_return_task_body(void const *args,
@@ -35,8 +35,8 @@ void device_state_init_return_task_body(void const *args,
 Realm::Event spawn_device_state_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
-    DeviceSpecificPerDeviceOpState const &result,
-    DeviceSpecificPerDeviceOpState *origin_result_ptr,
+    DeviceSpecificPtr<PerDeviceOpState> const &result,
+    DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr,
     Realm::Event precondition) {
   DeviceStateInitReturnTaskArgs task_args{
       result, origin_proc, origin_result_ptr};
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
index 7f3f2d185c..50c8daffb0 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -9,7 +9,9 @@
 #include "realm-execution/tasks/task_id_t.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/per_device_op_state.h"
 #include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
 #include "utils/optional.h"
 #include <optional>
 #include <type_traits>
@@ -59,11 +61,14 @@ void device_state_init_task_body(void const *args,
       assert_unwrap(result_invocation.node_attrs.per_device_op_state);
   // Important: to make sure this doesn't get deallocated, we intentionally leak
   // the allocation here
-  DeviceSpecificPerDeviceOpState *result_state_ptr =
-      new DeviceSpecificPerDeviceOpState{result_state};
+  PerDeviceOpState *result_state_ptr =
+      new PerDeviceOpState{get_device_state_from_device_specific(
+          result_state, ctx.get_current_device_idx())};
+  DeviceSpecificPtr<PerDeviceOpState> result_device_specific{
+      ctx.get_current_device_idx(), result_state_ptr};
   spawn_device_state_init_return_task(ctx,
                                       task_args.origin_proc,
-                                      *result_state_ptr,
+                                      result_device_specific,
                                       task_args.origin_result_ptr,
                                       Realm::Event::NO_EVENT);
 }
@@ -77,7 +82,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
-    DeviceSpecificPerDeviceOpState *result_ptr,
+    DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
     Realm::Event precondition) {
   DeviceStateInitTaskArgs task_args{
       invocation,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index dc262bbdb1..e67df885d3 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -6,9 +6,11 @@
 #include "realm-execution/tasks/impl/serializable_op_task_args.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.h"
+#include "task-spec/per_device_op_state.dtg.h"
 #include "task-spec/per_device_op_state.h"
 #include "task-spec/permissions.h"
 #include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
 #include "utils/optional.h"
 #include <type_traits>
 
@@ -49,11 +51,8 @@ void op_task_body(void const *args,
       /*profiling_settings=*/task_args.profiling_settings,
       /*ff_handle=*/device_handle,
       /*per_device_op_state=*/
-      transform(task_args.invocation.node_attrs.per_device_op_state,
-                [&](DeviceSpecificPerDeviceOpState const &op_state) {
-                  return get_device_state_from_device_specific(
-                      op_state, ctx.get_current_device_idx());
-                }),
+      transform(task_args.device_state.get(ctx.get_current_device_idx()),
+                [](PerDeviceOpState *ptr) { return *ptr; }),
       /*iteration_config=*/task_args.iteration_config,
       /*optimizer_attrs=*/task_args.optimizer_attrs,
       /*device_idx=*/ctx.get_current_device_idx());
@@ -64,6 +63,7 @@ Realm::Event
                   Realm::Processor target_proc,
                   DynamicNodeInvocation const &invocation,
                   TensorInstanceBacking const &tensor_backing,
+                  DeviceSpecificPtr<PerDeviceOpState> const &device_state,
                   ProfilingSettings const &profiling_settings,
                   DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
                   FFIterationConfig const &iteration_config,
@@ -71,6 +71,7 @@ Realm::Event
                   Realm::Event precondition) {
   OpTaskArgs task_args{invocation,
                        tensor_backing,
+                       device_state,
                        profiling_settings,
                        device_handle,
                        iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
index fed22ff393..2e7e02b529 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
@@ -35,7 +35,7 @@ DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
       /*optimizer_attrs=*/args.optimizer_attrs,
       /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
       /*origin_result_ptr=*/
-      reinterpret_cast<DeviceSpecificPerDeviceOpState *>(
+      reinterpret_cast<DeviceSpecificPtr<PerDeviceOpState> *>(
           args.origin_result_ptr),
   };
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
index 80994d4298..a17e58da5e 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -10,6 +10,7 @@ SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
       /*tensor_backing*/
       tensor_instance_backing_to_serializable(args.tensor_backing),
+      /*device_state=*/device_specific_ptr_to_serializable(args.device_state),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle),
       /*iteration_config=*/args.iteration_config,
@@ -22,6 +23,9 @@ OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
       /*tensor_backing*/
       tensor_instance_backing_from_serializable(args.tensor_backing),
+      /*device_state=*/
+      device_specific_ptr_from_serializable<PerDeviceOpState>(
+          args.device_state),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
       device_specific_ptr_from_serializable<ManagedPerDeviceFFHandle>(

From db65bda89dc7afa363449a61a6cc2dbd43c32888 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 17:34:28 -0800
Subject: [PATCH 072/113] Register loss task.

---
 .../src/realm-execution/tasks/realm_task_registry.cc           | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 914e8d1e29..fa056d6f33 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -117,6 +117,9 @@ Realm::Event register_all_tasks() {
       // Update tasks
       task_id_t::SGD_UPD_NCCL_TASK_ID,
       task_id_t::ADAM_UPD_NCCL_TASK_ID,
+
+      // Loss task
+      task_id_t::LOSS_BWD_TASK_ID,
   };
 
   for (task_id_t task_id : task_ids) {

From a07b038ab90eceed754a2abd4d19d8b39c0a10b2 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 17 Feb 2026 19:28:44 -0800
Subject: [PATCH 073/113] Test loss in Realm.

---
 .../test/src/local-execution/test_e2e.cc      |  4 +-
 .../test/src/realm-execution/test_e2e.cc      | 49 ++++++++++++++-----
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc b/lib/local-execution/test/src/local-execution/test_e2e.cc
index a74d165a31..615ba204cf 100644
--- a/lib/local-execution/test/src/local-execution/test_e2e.cc
+++ b/lib/local-execution/test/src/local-execution/test_e2e.cc
@@ -21,8 +21,8 @@
 
 using namespace ::FlexFlow;
 
-bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
-                       GenericTensorAccessorR const &last_epoch) {
+static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
+                              GenericTensorAccessorR const &last_epoch) {
   Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
   return tensor_accessor_all(
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 4dbfe09045..28665e840b 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -1,5 +1,11 @@
 #include "internal/realm_test_utils.h"
 #include "kernels/allocation.h"
+#include "kernels/compare_tensor_accessors.h"
+#include "kernels/copy_tensor_accessor.h"
+#include "kernels/format_accessor_contents.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/tensor_accessor_reductions.h"
+#include "op-attrs/parallel_tensor_shape.h"
 #include "op-attrs/tensor_shape.dtg.h"
 #include "op-attrs/tensor_slot_name.dtg.h"
 #include "pcg/device_type.dtg.h"
@@ -9,8 +15,11 @@
 #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
 #include "realm-execution/pcg_instance/pcg_instance.h"
 #include "realm-execution/realm_manager.h"
+#include "task-spec/permissions.h"
+#include "test/utils/doctest/check_kv.h"
 #include "utils/containers/require_only_key.h"
 #include <doctest/doctest.h>
 
@@ -19,6 +28,14 @@ namespace test {
 using namespace ::FlexFlow;
 namespace Realm = ::FlexFlow::Realm;
 
+static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
+                              GenericTensorAccessorR const &last_epoch) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  return tensor_accessor_all(
+      compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator));
+}
+
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("RealmBackend e2e Training") {
     std::vector<char *> fake_args =
@@ -216,20 +233,28 @@ TEST_SUITE(FF_TEST_SUITE) {
             /*profiling_settings=*/ProfilingSettings{0, 0},
             /*device_handle=*/device_handle,
             /*iteration_config=*/FFIterationConfig{1_p});
-        // loss_values.push_back(copy_tensor_accessor_r(
-        //     pcg_instance.get_loss_tensor_accessor().value(),
-        //     allocator));
+        loss_values.push_back(copy_tensor_accessor_r(
+            dynamic_tensor_accessor_from_instance(
+                pcg_instance.get_loss_tensor_instance().value(),
+                Realm::Event::NO_EVENT,
+                lift_to_parallel(
+                    TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}},
+                                DataType::FLOAT}),
+                Permissions::RO,
+                ctx.get_current_processor())
+                .require_read(),
+            allocator));
       }
 
-      // // Assert that each sample in the batch has a lower loss in last epoch
-      // // than the first epoch
-      // GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-      // GenericTensorAccessorR last_epoch_loss = loss_values.back();
-      // CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
-      //               check_kv("first_epoch_loss",
-      //                        format_accessor_r_contents(first_epoch_loss)),
-      //               check_kv("last_epoch_loss",
-      //                        format_accessor_r_contents(last_epoch_loss)));
+      // Assert that each sample in the batch has a lower loss in last epoch
+      // than the first epoch
+      GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+      GenericTensorAccessorR last_epoch_loss = loss_values.back();
+      CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
+                    check_kv("first_epoch_loss",
+                             format_accessor_r_contents(first_epoch_loss)),
+                    check_kv("last_epoch_loss",
+                             format_accessor_r_contents(last_epoch_loss)));
     });
   }
 }

From aa361e86e4d65957d09b4fba8214056a802a3fa3 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 18 Feb 2026 10:05:10 -0800
Subject: [PATCH 074/113] Test CPU model parallelism.

---
 lib/realm-execution/test/src/realm-execution/test_e2e.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 28665e840b..706fc002c1 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -37,9 +37,9 @@ static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
 }
 
 TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("RealmBackend e2e Training") {
+  TEST_CASE("RealmBackend e2e Training (CPU Model Parallelism)") {
     std::vector<char *> fake_args =
-        make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/0_n);
+        make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n);
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
 
@@ -149,6 +149,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
 
       MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU};
+      MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU};
       ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
       MappedParallelComputationGraph mpcg{
           pcg,
@@ -165,7 +166,7 @@ TEST_SUITE(FF_TEST_SUITE) {
                          {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
               {weights_layer_2.parallel_layer,
                MappedOperatorTaskGroup{
-                   {{cpu0,
+                   {{cpu1,
                      OperatorAtomicTaskShardBinding{
                          {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
               {linear_operator_1.parallel_layer,
@@ -178,7 +179,7 @@ TEST_SUITE(FF_TEST_SUITE) {
                      }}}}}},
               {linear_operator_2.parallel_layer,
                MappedOperatorTaskGroup{
-                   {{cpu0,
+                   {{cpu1,
                      OperatorAtomicTaskShardBinding{{
                          {TensorSlotName::INPUT, tensor_coord0},
                          {TensorSlotName::WEIGHT, tensor_coord0},

From 6d2313a5b31404a697f2358e4086e50df3116e56 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 18 Feb 2026 10:10:47 -0800
Subject: [PATCH 075/113] Use Realm's own allocator in test.

---
 .../test/src/realm-execution/test_e2e.cc              | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 706fc002c1..02c1365039 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -3,7 +3,6 @@
 #include "kernels/compare_tensor_accessors.h"
 #include "kernels/copy_tensor_accessor.h"
 #include "kernels/format_accessor_contents.h"
-#include "kernels/local_cpu_allocator.h"
 #include "kernels/tensor_accessor_reductions.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "op-attrs/tensor_shape.dtg.h"
@@ -17,6 +16,7 @@
 #include "realm-execution/distributed_device_handle.h"
 #include "realm-execution/dynamic_tensor_accessor_from_instance.h"
 #include "realm-execution/pcg_instance/pcg_instance.h"
+#include "realm-execution/realm_context.h"
 #include "realm-execution/realm_manager.h"
 #include "task-spec/permissions.h"
 #include "test/utils/doctest/check_kv.h"
@@ -28,12 +28,11 @@ namespace test {
 using namespace ::FlexFlow;
 namespace Realm = ::FlexFlow::Realm;
 
-static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
+static bool did_loss_decrease(RealmContext &ctx,
+                              GenericTensorAccessorR const &first_epoch,
                               GenericTensorAccessorR const &last_epoch) {
-  Allocator cpu_allocator = create_local_cpu_memory_allocator();
-
-  return tensor_accessor_all(
-      compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator));
+  return tensor_accessor_all(compare_tensor_accessors_le(
+      last_epoch, first_epoch, ctx.get_current_device_allocator()));
 }
 
 TEST_SUITE(FF_TEST_SUITE) {

From 14eb1fca4d914b2a76cb3e565e33e874ce3ea9db Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 18 Feb 2026 11:01:21 -0800
Subject: [PATCH 076/113] Fix typo.

---
 lib/realm-execution/test/src/realm-execution/test_e2e.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 02c1365039..96a8ba49dc 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -250,7 +250,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       // than the first epoch
       GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
       GenericTensorAccessorR last_epoch_loss = loss_values.back();
-      CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
+      CHECK_MESSAGE(did_loss_decrease(ctx, first_epoch_loss, last_epoch_loss),
                     check_kv("first_epoch_loss",
                              format_accessor_r_contents(first_epoch_loss)),
                     check_kv("last_epoch_loss",

From bb18f0d3944179e342a40ac312a5111fce0892d4 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 18 Feb 2026 16:22:19 -0800
Subject: [PATCH 077/113] Add Realm top-level README.

---
 lib/realm-execution/README.md                 | 32 +++++++++++++++++++
 .../pcg_instance/pcg_instance.cc              |  8 -----
 2 files changed, 32 insertions(+), 8 deletions(-)
 create mode 100644 lib/realm-execution/README.md

diff --git a/lib/realm-execution/README.md b/lib/realm-execution/README.md
new file mode 100644
index 0000000000..1454c7eac8
--- /dev/null
+++ b/lib/realm-execution/README.md
@@ -0,0 +1,32 @@
+The Realm backend for distributed execution.
+
+This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution.
+
+Major components:
+
+* `PCGInstance`: the main public interface for the Realm backend. It takes a mapped PCG and lowers it through the dynamic graph to get the fully-specified execution order of tasks to be executed. Besides the usual dynamic graph passes (pass expansion, update insertion, shard expansion), this class also tracks the allocation of Realm instances for tensors.
+* `RealmManager`: manages the initialization and shutdown of the Realm runtime. Provides the interface to launch the controller that runs the rest of the computation.
+* `RealmContext`: an interface that wraps the rest of Realm and protects against certain classes of bugs, such as shutdown bugs. **Do NOT call Realm directly unless you know what you are doing.**
+* `tasks/`: the Realm task implementations and their supporting infrastructure.
+  * `impl/`: the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
+  * `serializer/`: additional support for serializing Realm data types.
+  * `realm_task_registry.h`: manages the registration of Realm tasks. All Realm tasks go through this interface.
+  * `task_id_t.h` and `realm_task_id_t.h`: types to represent Realm tasks, along with an encoding to Realm's native task ID type.
+
+Other components used mainly within `PCGInstance`:
+
+ * `DistributedDeviceHandle`: represents a distributed device handle (i.e., the per-device handles for all the processors in the system, CPUs as well as GPUs), for convenience.
+ * `DependenceSet`: tracks dependencies during execution of tasks.
+ * `distributed_device_state_initialization.h`: performs device state initialization of dynamic graph nodes and returns the resulting `PerDeviceOpStateBacking`.
+ * `instance_allocation.h`: allocates instances for tensors in the dynamic graph and returns the resulting `TensorInstanceBacking`.
+
+TODO list:
+
+* external instances
+* copies
+* task fusion
+* parallel operator implementation (partition, reduce, gather, etc.)
+* fused parallel operators (e.g., reduce + broadcast = allreduce)
+* memory-optimizing compiler integration (tensor creation/destruction, tensor reuse)
+* control replication
+* Realm subgraphs
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 5d1a63ba5b..9f359ded10 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -147,14 +147,6 @@ PCGInstance create_pcg_instance(
                      /*device_state_backing=*/device_state_backing,
                      /*optimizer_attrs=*/optimizer_attrs,
                      /*logit_grad_tensor=*/logit_grad_tensor};
-
-  // TODO list:
-  //  * external instances
-  //  * copies
-  //  * parallel operator implementation (partition, reduce, gather, etc.)
-  //  * and fused parallel operators (reduce + broadcast = allreduce)
-  //  * memory-optimizing compiler integration (tensor creation/destruction,
-  //  tensor reuse)
 }
 
 static std::unordered_map<dynamic_layer_guid_t, Realm::Event>

From 12b70e5c71e3b5550af61a0990d2cc064e8cdbe8 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 20 Feb 2026 12:02:23 -0800
Subject: [PATCH 078/113] Add and fix GPU test (no loss so far).

---
 .../realm-execution/tasks/impl/op_task.h      |  23 +-
 .../tasks/impl/op_task_args.dtg.toml          |   2 +-
 .../impl/serializable_op_task_args.dtg.toml   |   2 +-
 ...distributed_device_state_initialization.cc |  60 +++--
 .../pcg_instance/pcg_instance.cc              |   3 +-
 .../src/realm-execution/tasks/impl/op_task.cc |  27 ++-
 .../tasks/impl/serializable_op_task_args.cc   |   9 +-
 .../test/src/realm-execution/test_e2e.cc      | 224 ++++++++++++++++++
 8 files changed, 301 insertions(+), 49 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 33dcbff895..8399742424 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -12,22 +12,23 @@
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
 #include "task-spec/ff_iteration_config.dtg.h"
 #include "task-spec/per_device_op_state.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event
-    spawn_op_task(RealmContext &ctx,
-                  Realm::Processor target_proc,
-                  DynamicNodeInvocation const &invocation,
-                  TensorInstanceBacking const &tensor_backing,
-                  DeviceSpecificPtr<PerDeviceOpState> const &device_state,
-                  ProfilingSettings const &profiling_settings,
-                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-                  FFIterationConfig const &iteration_config,
-                  std::optional<OptimizerAttrs> const &optimizer_attrs,
-                  Realm::Event precondition);
+Realm::Event spawn_op_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    TensorInstanceBacking const &tensor_backing,
+    std::optional<DeviceSpecificPtr<PerDeviceOpState>> const &device_state,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    std::optional<OptimizerAttrs> const &optimizer_attrs,
+    Realm::Event precondition);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
index a15c8dce11..f6bb83fbca 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -24,7 +24,7 @@ type = "::FlexFlow::TensorInstanceBacking"
 
 [[fields]]
 name = "device_state"
-type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>"
+type = "std::optional<::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>>"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
index 2be0034c46..adac6631ee 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
@@ -32,7 +32,7 @@ type = "::FlexFlow::SerializableTensorInstanceBacking"
 
 [[fields]]
 name = "device_state"
-type = "::FlexFlow::SerializableDeviceSpecificPtr"
+type = "std::optional<::FlexFlow::SerializableDeviceSpecificPtr>"
 
 [[fields]]
 name = "profiling_settings"
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
index c6d8ea3e69..5c0aff00c2 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -7,9 +7,12 @@
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/values.h"
 #include "utils/optional.h"
 #include <optional>
 #include <unordered_map>
+#include <utility>
 
 namespace FlexFlow {
 
@@ -26,16 +29,9 @@ PerDeviceOpStateBacking perform_distributed_device_state_initialization(
   // Initialize all operators and save the per-device op state
   ASSERT(no_nodes_are_initialized(dg));
 
-  std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
-      result;
-
-  // Preallocate output before launching tasks
-  for (DynamicNodeInvocation const &invocation : dg.invocations) {
-    result.insert(std::pair{invocation,
-                            DeviceSpecificPtr<PerDeviceOpState>{
-                                ctx.get_current_device_idx(), std::nullopt}});
-  }
-
+  std::unordered_map<DynamicNodeInvocation,
+                     DeviceSpecificPtr<PerDeviceOpState> *>
+      device_state_map;
   for (DynamicNodeInvocation const &invocation : dg.invocations) {
     Realm::Processor target_proc = ctx.map_device_coord_to_processor(
         assert_unwrap(invocation.node_attrs.device_coord));
@@ -44,20 +40,44 @@ PerDeviceOpStateBacking perform_distributed_device_state_initialization(
         subset_tensor_instance_backing_for_invocation(tensor_instance_backing,
                                                       invocation);
 
-    spawn_device_state_init_task(ctx,
-                                 target_proc,
-                                 invocation,
-                                 tensor_backing,
-                                 profiling_settings,
-                                 device_handle.at(target_proc),
-                                 iteration_config,
-                                 optimizer_attrs,
-                                 &result.at(invocation),
-                                 precondition);
+    DeviceSpecificPtr<PerDeviceOpState> *device_state_ptr =
+        new DeviceSpecificPtr<PerDeviceOpState>{ctx.get_current_device_idx(),
+                                                std::nullopt};
+
+    std::optional<Realm::Event> completion_event =
+        spawn_device_state_init_task(ctx,
+                                     target_proc,
+                                     invocation,
+                                     tensor_backing,
+                                     profiling_settings,
+                                     device_handle.at(target_proc),
+                                     iteration_config,
+                                     optimizer_attrs,
+                                     device_state_ptr,
+                                     precondition);
+
+    if (completion_event.has_value()) {
+      device_state_map.insert(std::pair{invocation, device_state_ptr});
+    } else {
+      // Task doesn't require initialization, clean up and don't store result
+      delete device_state_ptr;
+    }
   }
 
   ctx.get_outstanding_events().wait();
 
+  auto deref = [](DynamicNodeInvocation const &i,
+                  DeviceSpecificPtr<PerDeviceOpState> *const &p) {
+    return std::pair{i, *p};
+  };
+  std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
+      result = transform(device_state_map, deref);
+
+  for (DeviceSpecificPtr<PerDeviceOpState> *device_state_ptr :
+       values(device_state_map)) {
+    delete device_state_ptr;
+  }
+
   return PerDeviceOpStateBacking{/*backing=*/result};
 }
 
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 9f359ded10..4b08e9a430 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -20,6 +20,7 @@
 #include "task-spec/dynamic_graph/update_insertion.h"
 #include "utils/containers/map_values.h"
 #include "utils/containers/transform.h"
+#include "utils/containers/try_at.h"
 #include "utils/containers/values.h"
 #include "utils/graph/digraph/algorithms/get_topological_ordering.h"
 #include "utils/optional.h"
@@ -197,7 +198,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
                           target_proc,
                           invocation,
                           tensor_backing,
-                          device_state_backing.backing.at(invocation),
+                          try_at(device_state_backing.backing, invocation),
                           profiling_settings,
                           device_handle.at(target_proc),
                           iteration_config,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index e67df885d3..c7dcdb39c2 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -51,24 +51,27 @@ void op_task_body(void const *args,
       /*profiling_settings=*/task_args.profiling_settings,
       /*ff_handle=*/device_handle,
       /*per_device_op_state=*/
-      transform(task_args.device_state.get(ctx.get_current_device_idx()),
+      transform(and_then(task_args.device_state,
+                         [&](DeviceSpecificPtr<PerDeviceOpState> const &d) {
+                           return d.get(ctx.get_current_device_idx());
+                         }),
                 [](PerDeviceOpState *ptr) { return *ptr; }),
       /*iteration_config=*/task_args.iteration_config,
       /*optimizer_attrs=*/task_args.optimizer_attrs,
       /*device_idx=*/ctx.get_current_device_idx());
 }
 
-Realm::Event
-    spawn_op_task(RealmContext &ctx,
-                  Realm::Processor target_proc,
-                  DynamicNodeInvocation const &invocation,
-                  TensorInstanceBacking const &tensor_backing,
-                  DeviceSpecificPtr<PerDeviceOpState> const &device_state,
-                  ProfilingSettings const &profiling_settings,
-                  DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
-                  FFIterationConfig const &iteration_config,
-                  std::optional<OptimizerAttrs> const &optimizer_attrs,
-                  Realm::Event precondition) {
+Realm::Event spawn_op_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    TensorInstanceBacking const &tensor_backing,
+    std::optional<DeviceSpecificPtr<PerDeviceOpState>> const &device_state,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    std::optional<OptimizerAttrs> const &optimizer_attrs,
+    Realm::Event precondition) {
   OpTaskArgs task_args{invocation,
                        tensor_backing,
                        device_state,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
index a17e58da5e..32d54adc37 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -2,6 +2,7 @@
 #include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
 #include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+#include "utils/containers/transform.h"
 
 namespace FlexFlow {
 
@@ -10,7 +11,9 @@ SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
       /*tensor_backing*/
       tensor_instance_backing_to_serializable(args.tensor_backing),
-      /*device_state=*/device_specific_ptr_to_serializable(args.device_state),
+      /*device_state=*/
+      transform(args.device_state,
+                device_specific_ptr_to_serializable<PerDeviceOpState>),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle),
       /*iteration_config=*/args.iteration_config,
@@ -24,8 +27,8 @@ OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
       /*tensor_backing*/
       tensor_instance_backing_from_serializable(args.tensor_backing),
       /*device_state=*/
-      device_specific_ptr_from_serializable<PerDeviceOpState>(
-          args.device_state),
+      transform(args.device_state,
+                device_specific_ptr_from_serializable<PerDeviceOpState>),
       /*profiling_settings=*/args.profiling_settings,
       /*device_handle=*/
       device_specific_ptr_from_serializable<ManagedPerDeviceFFHandle>(
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 96a8ba49dc..d9252693a1 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -259,4 +259,228 @@ TEST_SUITE(FF_TEST_SUITE) {
   }
 }
 
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("RealmBackend e2e Training (GPU Model Parallelism)") {
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/1_n);
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+
+    RealmManager manager(&fake_argc, &fake_argv);
+
+    (void)manager.start_controller([](RealmContext &ctx) {
+      Allocator allocator = ctx.get_current_device_allocator();
+
+      positive_int batch_size = 10_p;
+      positive_int data_dim = 16_p;
+      positive_int hidden_dim = 32_p;
+      positive_int output_dim = 1_p;
+
+      TensorShape output_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+
+      GenericTensorAccessorW label_tensor_backing =
+          allocator.allocate_tensor(output_tensor_shape);
+
+      // construct computation graph
+      ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+      TensorShape input_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+      TensorShape label_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
+      TensorShape weight_shape_1 = TensorShape{
+          TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
+      TensorShape weight_shape_2 = TensorShape{
+          TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
+
+      ParallelLayerAddedResult inputs_layer =
+          pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+      parallel_tensor_guid_t t_input =
+          require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_1 =
+          require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_2 =
+          require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_input,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_1,
+              },
+          });
+      parallel_tensor_guid_t t_linear_1 =
+          require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_linear_1,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_2,
+              },
+          });
+      parallel_tensor_guid_t t_linear_2 =
+          require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
+
+      MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU};
+      ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
+      MappedParallelComputationGraph mpcg{
+          pcg,
+          {
+              {inputs_layer.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {linear_operator_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+              {linear_operator_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+          },
+      };
+      MappedOperatorTaskGroup loss_mapping{
+          {{gpu0,
+            OperatorAtomicTaskShardBinding{{
+                {TensorSlotName::INPUT, tensor_coord0},
+                {TensorSlotName::LOGIT, tensor_coord0},
+            }}}}};
+
+      // instantiate computation graph
+      LossAttrs loss_attrs = LossAttrs{
+          NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+      OptimizerAttrs optimizer_attrs =
+          OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                           /*momentum=*/0.9,
+                                           /*nesterov=*/false,
+                                           /*weight_decay=*/0.001}};
+
+      std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
+          input_tensors;
+
+      DistributedDeviceHandle device_handle = create_distributed_device_handle(
+          ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      PCGInstance pcg_instance = create_pcg_instance(
+          /*ctx=*/ctx,
+          /*mpcg=*/mpcg,
+          /*optimizer=*/optimizer_attrs,
+          /*loss=*/loss_attrs,
+          /*label_tensor=*/label_tensor,
+          /*logit_tensor=*/t_linear_2,
+          /*loss_mapping=*/loss_mapping,
+          /*input_tensors=*/input_tensors,
+          /*profiling_settings=*/ProfilingSettings{0, 0},
+          /*device_handle=*/device_handle,
+          /*iteration_config=*/FFIterationConfig{1_p});
+
+      // begin training loop
+      int num_epochs = 5;
+      std::vector<GenericTensorAccessorR> loss_values;
+
+      for (int i = 0; i < num_epochs; i++) {
+        perform_all_passes_for_pcg_instance(
+            /*instance=*/pcg_instance,
+            /*profiling_settings=*/ProfilingSettings{0, 0},
+            /*device_handle=*/device_handle,
+            /*iteration_config=*/FFIterationConfig{1_p});
+        // loss_values.push_back(copy_tensor_accessor_r(
+        //     dynamic_tensor_accessor_from_instance(
+        //         pcg_instance.get_loss_tensor_instance().value(),
+        //         Realm::Event::NO_EVENT,
+        //         lift_to_parallel(
+        //             TensorShape{TensorDims{FFOrdered{output_dim,
+        //             hidden_dim}},
+        //                         DataType::FLOAT}),
+        //         Permissions::RO,
+        //         ctx.get_current_processor())
+        //         .require_read(),
+        //     allocator));
+      }
+
+      // // Assert that each sample in the batch has a lower loss in last epoch
+      // // than the first epoch
+      // GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+      // GenericTensorAccessorR last_epoch_loss = loss_values.back();
+      // CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
+      //               check_kv("first_epoch_loss",
+      //                        format_accessor_r_contents(first_epoch_loss)),
+      //               check_kv("last_epoch_loss",
+      //                        format_accessor_r_contents(last_epoch_loss)));
+    });
+  }
+}
+
 } // namespace test

From 3fb928a376b90196ae91ebe7cab15d0092c29b0e Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 20 Feb 2026 13:05:23 -0800
Subject: [PATCH 079/113] Add a GPU distributed handle test.

---
 .../distributed_device_handle.cc              | 40 +++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
index fb7dff01e3..aaefe337db 100644
--- a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
@@ -24,9 +24,43 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*allowTensorOpMathConversion=*/true);
 
       // Make sure we have handles for the processors we're expecting
-      Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
-      pq.only_kind(Realm::Processor::LOC_PROC);
-      for (Realm::Processor proc : pq) {
+      Realm::Machine::ProcessorQuery cpus(Realm::Machine::get_machine());
+      cpus.only_kind(Realm::Processor::LOC_PROC);
+      CHECK(cpus.count() == 2);
+      for (Realm::Processor proc : cpus) {
+        handle.at(proc);
+      }
+    });
+  }
+}
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("DistributedDeviceHandle (GPU)") {
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/1_n);
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+
+    RealmManager manager(&fake_argc, &fake_argv);
+
+    (void)manager.start_controller([](RealmContext &ctx) {
+      DistributedDeviceHandle handle = create_distributed_device_handle(
+          /*ctx=*/ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      // Make sure we have handles for the processors we're expecting
+      Realm::Machine::ProcessorQuery cpus(Realm::Machine::get_machine());
+      cpus.only_kind(Realm::Processor::LOC_PROC);
+      CHECK(cpus.count() == 2);
+      for (Realm::Processor proc : cpus) {
+        handle.at(proc);
+      }
+
+      Realm::Machine::ProcessorQuery gpus(Realm::Machine::get_machine());
+      gpus.only_kind(Realm::Processor::TOC_PROC);
+      CHECK(gpus.count() == 1);
+      for (Realm::Processor proc : gpus) {
         handle.at(proc);
       }
     });

From ed730fb634079a8425ab4bb21648aeccd88230c2 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 20 Feb 2026 15:07:25 -0800
Subject: [PATCH 080/113] Test GPU loss values.

---
 .../dynamic_tensor_accessor_from_instance.cc  | 45 ++++++++++++++-----
 .../test/src/realm-execution/test_e2e.cc      | 41 +++++++++--------
 2 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
index d1c773b1fa..a2a40e3752 100644
--- a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
+++ b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
@@ -6,6 +6,38 @@
 
 namespace FlexFlow {
 
+static DeviceType infer_device_type_from_memory_and_processor(
+    Realm::Memory inst_memory, Realm::Processor for_processor) {
+  DeviceType device_type;
+  switch (inst_memory.kind()) {
+    case Realm::Memory::SYSTEM_MEM:
+      // Only accessible on CPU
+      device_type = DeviceType::CPU;
+      break;
+    case Realm::Memory::GPU_FB_MEM:
+      // Only accessible on GPU
+      device_type = DeviceType::GPU;
+      break;
+    case Realm::Memory::Z_COPY_MEM: {
+      // Accessible on either CPU or GPU, so infer based on where we're trying
+      // to access from
+      switch (for_processor.kind()) {
+        case Realm::Processor::LOC_PROC:
+          device_type = DeviceType::CPU;
+          break;
+        case Realm::Processor::TOC_PROC:
+          device_type = DeviceType::GPU;
+          break;
+        default:
+          PANIC("Unexpected Realm Processor kind", for_processor.kind());
+      }
+    } break;
+    default:
+      PANIC("Unexpected Realm Memory kind", inst_memory.kind());
+  }
+  return device_type;
+}
+
 DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
     Realm::RegionInstance inst,
     Realm::Event ready,
@@ -14,17 +46,8 @@ DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
     Realm::Processor for_processor) {
   ready.wait();
 
-  DeviceType device_type;
-  switch (for_processor.kind()) {
-    case Realm::Processor::LOC_PROC:
-      device_type = DeviceType::CPU;
-      break;
-    case Realm::Processor::TOC_PROC:
-      device_type = DeviceType::GPU;
-      break;
-    default:
-      PANIC("Unexpected Realm Processor kind", for_processor.kind());
-  }
+  DeviceType device_type = infer_device_type_from_memory_and_processor(
+      inst.get_location(), for_processor);
 
   size_t expected_size =
       int{get_piece_size_in_bytes(parallel_tensor_shape).unwrap_num_bytes()};
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index d9252693a1..f5f7357105 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -456,29 +456,28 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             /*profiling_settings=*/ProfilingSettings{0, 0},
             /*device_handle=*/device_handle,
             /*iteration_config=*/FFIterationConfig{1_p});
-        // loss_values.push_back(copy_tensor_accessor_r(
-        //     dynamic_tensor_accessor_from_instance(
-        //         pcg_instance.get_loss_tensor_instance().value(),
-        //         Realm::Event::NO_EVENT,
-        //         lift_to_parallel(
-        //             TensorShape{TensorDims{FFOrdered{output_dim,
-        //             hidden_dim}},
-        //                         DataType::FLOAT}),
-        //         Permissions::RO,
-        //         ctx.get_current_processor())
-        //         .require_read(),
-        //     allocator));
+        loss_values.push_back(copy_tensor_accessor_r(
+            dynamic_tensor_accessor_from_instance(
+                pcg_instance.get_loss_tensor_instance().value(),
+                Realm::Event::NO_EVENT,
+                lift_to_parallel(
+                    TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}},
+                                DataType::FLOAT}),
+                Permissions::RO,
+                ctx.get_current_processor())
+                .require_read(),
+            allocator));
       }
 
-      // // Assert that each sample in the batch has a lower loss in last epoch
-      // // than the first epoch
-      // GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-      // GenericTensorAccessorR last_epoch_loss = loss_values.back();
-      // CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
-      //               check_kv("first_epoch_loss",
-      //                        format_accessor_r_contents(first_epoch_loss)),
-      //               check_kv("last_epoch_loss",
-      //                        format_accessor_r_contents(last_epoch_loss)));
+      // Assert that each sample in the batch has a lower loss in last epoch
+      // than the first epoch
+      GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+      GenericTensorAccessorR last_epoch_loss = loss_values.back();
+      CHECK_MESSAGE(did_loss_decrease(ctx, first_epoch_loss, last_epoch_loss),
+                    check_kv("first_epoch_loss",
+                             format_accessor_r_contents(first_epoch_loss)),
+                    check_kv("last_epoch_loss",
+                             format_accessor_r_contents(last_epoch_loss)));
     });
   }
 }

From 4ab9617da6bc4315efac69a8e5c9fd6cba400585 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 20 Feb 2026 15:07:41 -0800
Subject: [PATCH 081/113] Update Realm to include build fixes.

---
 .flake/pkgs/realm.nix | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix
index b809573690..9f1fb8832c 100644
--- a/.flake/pkgs/realm.nix
+++ b/.flake/pkgs/realm.nix
@@ -13,13 +13,13 @@ in
 
 stdenv.mkDerivation rec {
   pname = "realm";
-  version = "2026-02-06";
+  version = "2026-02-18";
 
   src = fetchFromGitHub {
     owner = "StanfordLegion";
     repo = "realm";
-    rev = "0405b67ca14b586f7dec0dcddee194cecee7efa6";
-    sha256 = "sha256-iUPVV1rh3QuyDKgXuu8aDlaZGlNwcpPvPsSVLWp8tr4=";
+    rev = "47f18543592cb69c5bc7c97ee7e2bc521d377d3e";
+    sha256 = "sha256-brAWh2p67hIyfrtNKN+6XZjIB0V2gYGBjdIocuwtmj4=";
   };
 
   nativeBuildInputs = [

From 36a688d613ee35ba8a877ab41c3c4b130568a3ff Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Mon, 23 Feb 2026 11:19:16 -0800
Subject: [PATCH 082/113] Ensure that Realm tests do not leak instances.

---
 .../include/realm-execution/realm_allocator.h |  2 +-
 .../src/realm-execution/realm_allocator.cc    |  4 ++
 .../test/src/realm-execution/test_e2e.cc      | 47 +++++++++++++------
 3 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
index d72f2d7f91..e53994dd20 100644
--- a/lib/realm-execution/include/realm-execution/realm_allocator.h
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -8,11 +8,11 @@ namespace FlexFlow {
 
 struct RealmAllocator : public IAllocator {
   RealmAllocator(Realm::Processor processor, Realm::Memory memory);
+  ~RealmAllocator();
 
   RealmAllocator() = delete;
   RealmAllocator(RealmAllocator const &) = delete;
   RealmAllocator(RealmAllocator &&) = delete;
-  ~RealmAllocator() = default;
 
   void *allocate(size_t) override;
   void deallocate(void *) override;
diff --git a/lib/realm-execution/src/realm-execution/realm_allocator.cc b/lib/realm-execution/src/realm-execution/realm_allocator.cc
index f24106b0bc..37721fbcee 100644
--- a/lib/realm-execution/src/realm-execution/realm_allocator.cc
+++ b/lib/realm-execution/src/realm-execution/realm_allocator.cc
@@ -7,6 +7,10 @@ namespace FlexFlow {
 RealmAllocator::RealmAllocator(Realm::Processor processor, Realm::Memory memory)
     : processor(processor), memory(memory) {}
 
+RealmAllocator::~RealmAllocator() {
+  ASSERT(this->ptr_instances.empty());
+}
+
 void *RealmAllocator::allocate(size_t requested_memory_size) {
   Realm::Rect<1> bounds{Realm::Point<1>::ZEROES(),
                         Realm::Point<1>{requested_memory_size} -
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index f5f7357105..1ac471d491 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -28,11 +28,14 @@ namespace test {
 using namespace ::FlexFlow;
 namespace Realm = ::FlexFlow::Realm;
 
-static bool did_loss_decrease(RealmContext &ctx,
-                              GenericTensorAccessorR const &first_epoch,
-                              GenericTensorAccessorR const &last_epoch) {
-  return tensor_accessor_all(compare_tensor_accessors_le(
-      last_epoch, first_epoch, ctx.get_current_device_allocator()));
+static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
+                              GenericTensorAccessorR const &last_epoch,
+                              Allocator &allocator) {
+  GenericTensorAccessorW tensor_le =
+      compare_tensor_accessors_le(last_epoch, first_epoch, allocator);
+  bool result = tensor_accessor_all(tensor_le);
+  allocator.deallocate_tensor(tensor_le);
+  return result;
 }
 
 TEST_SUITE(FF_TEST_SUITE) {
@@ -250,11 +253,18 @@ TEST_SUITE(FF_TEST_SUITE) {
       // than the first epoch
       GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
       GenericTensorAccessorR last_epoch_loss = loss_values.back();
-      CHECK_MESSAGE(did_loss_decrease(ctx, first_epoch_loss, last_epoch_loss),
-                    check_kv("first_epoch_loss",
-                             format_accessor_r_contents(first_epoch_loss)),
-                    check_kv("last_epoch_loss",
-                             format_accessor_r_contents(last_epoch_loss)));
+      CHECK_MESSAGE(
+          did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator),
+          check_kv("first_epoch_loss",
+                   format_accessor_r_contents(first_epoch_loss)),
+          check_kv("last_epoch_loss",
+                   format_accessor_r_contents(last_epoch_loss)));
+
+      for (GenericTensorAccessorR const &loss_value : loss_values) {
+        allocator.deallocate_tensor(loss_value);
+      }
+      allocator.deallocate_tensor(label_tensor);
+      allocator.deallocate_tensor(label_tensor_backing);
     });
   }
 }
@@ -473,11 +483,18 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       // than the first epoch
       GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
       GenericTensorAccessorR last_epoch_loss = loss_values.back();
-      CHECK_MESSAGE(did_loss_decrease(ctx, first_epoch_loss, last_epoch_loss),
-                    check_kv("first_epoch_loss",
-                             format_accessor_r_contents(first_epoch_loss)),
-                    check_kv("last_epoch_loss",
-                             format_accessor_r_contents(last_epoch_loss)));
+      CHECK_MESSAGE(
+          did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator),
+          check_kv("first_epoch_loss",
+                   format_accessor_r_contents(first_epoch_loss)),
+          check_kv("last_epoch_loss",
+                   format_accessor_r_contents(last_epoch_loss)));
+
+      for (GenericTensorAccessorR const &loss_value : loss_values) {
+        allocator.deallocate_tensor(loss_value);
+      }
+      allocator.deallocate_tensor(label_tensor);
+      allocator.deallocate_tensor(label_tensor_backing);
     });
   }
 }

From 129fd844cfc36430bf321a069b4149e188fb98cc Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 24 Feb 2026 13:36:10 -0800
Subject: [PATCH 083/113] Update Realm allocator to follow pattern of other
 allocators.

---
 .../include/realm-execution/realm_allocator.h            | 2 +-
 .../src/realm-execution/realm_allocator.cc               | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
index e53994dd20..d716016676 100644
--- a/lib/realm-execution/include/realm-execution/realm_allocator.h
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -8,11 +8,11 @@ namespace FlexFlow {
 
 struct RealmAllocator : public IAllocator {
   RealmAllocator(Realm::Processor processor, Realm::Memory memory);
-  ~RealmAllocator();
 
   RealmAllocator() = delete;
   RealmAllocator(RealmAllocator const &) = delete;
   RealmAllocator(RealmAllocator &&) = delete;
+  ~RealmAllocator() override;
 
   void *allocate(size_t) override;
   void deallocate(void *) override;
diff --git a/lib/realm-execution/src/realm-execution/realm_allocator.cc b/lib/realm-execution/src/realm-execution/realm_allocator.cc
index 37721fbcee..194210cf5a 100644
--- a/lib/realm-execution/src/realm-execution/realm_allocator.cc
+++ b/lib/realm-execution/src/realm-execution/realm_allocator.cc
@@ -1,6 +1,8 @@
 #include "realm-execution/realm_allocator.h"
 #include "kernels/device.h"
 #include "pcg/device_type.dtg.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/values.h"
 
 namespace FlexFlow {
 
@@ -8,7 +10,9 @@ RealmAllocator::RealmAllocator(Realm::Processor processor, Realm::Memory memory)
     : processor(processor), memory(memory) {}
 
 RealmAllocator::~RealmAllocator() {
-  ASSERT(this->ptr_instances.empty());
+  for (Realm::RegionInstance const &instance : values(this->ptr_instances)) {
+    instance.destroy(Realm::Event::NO_EVENT);
+  }
 }
 
 void *RealmAllocator::allocate(size_t requested_memory_size) {
@@ -33,6 +37,9 @@ void *RealmAllocator::allocate(size_t requested_memory_size) {
 }
 
 void RealmAllocator::deallocate(void *ptr) {
+  ASSERT(contains_key(this->ptr_instances, ptr),
+         "Deallocating a pointer that was not allocated by this Allocator");
+
   this->ptr_instances.at(ptr).destroy(Realm::Event::NO_EVENT);
   this->ptr_instances.erase(ptr);
 }

From d64306924c019bdcadb692e4e3a71ce6ad1eabcd Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 24 Feb 2026 13:40:08 -0800
Subject: [PATCH 084/113] Remove explicit deallocation which is not required by
 updated allocator.

---
 .../test/src/realm-execution/test_e2e.cc      | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 1ac471d491..0914c054d7 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -31,11 +31,8 @@ namespace Realm = ::FlexFlow::Realm;
 static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
                               GenericTensorAccessorR const &last_epoch,
                               Allocator &allocator) {
-  GenericTensorAccessorW tensor_le =
-      compare_tensor_accessors_le(last_epoch, first_epoch, allocator);
-  bool result = tensor_accessor_all(tensor_le);
-  allocator.deallocate_tensor(tensor_le);
-  return result;
+  return tensor_accessor_all(
+      compare_tensor_accessors_le(last_epoch, first_epoch, allocator));
 }
 
 TEST_SUITE(FF_TEST_SUITE) {
@@ -259,12 +256,6 @@ TEST_SUITE(FF_TEST_SUITE) {
                    format_accessor_r_contents(first_epoch_loss)),
           check_kv("last_epoch_loss",
                    format_accessor_r_contents(last_epoch_loss)));
-
-      for (GenericTensorAccessorR const &loss_value : loss_values) {
-        allocator.deallocate_tensor(loss_value);
-      }
-      allocator.deallocate_tensor(label_tensor);
-      allocator.deallocate_tensor(label_tensor_backing);
     });
   }
 }
@@ -489,12 +480,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                    format_accessor_r_contents(first_epoch_loss)),
           check_kv("last_epoch_loss",
                    format_accessor_r_contents(last_epoch_loss)));
-
-      for (GenericTensorAccessorR const &loss_value : loss_values) {
-        allocator.deallocate_tensor(loss_value);
-      }
-      allocator.deallocate_tensor(label_tensor);
-      allocator.deallocate_tensor(label_tensor_backing);
     });
   }
 }

From 916dcf5630fb2ef4e64b60a4f43caea4e84d4554 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Mon, 23 Feb 2026 14:34:47 -0800
Subject: [PATCH 085/113] Support for PRealm.

---
 .flake/pkgs/realm.nix                         |  6 +++---
 .../distributed_device_handle.h               |  1 -
 .../include/realm-execution/hash/processor.h  | 20 -------------------
 .../realm-execution/instance_allocation.h     |  3 +++
 .../pcg_instance/pcg_instance.h               |  1 +
 .../include/realm-execution/realm.h           |  2 +-
 .../serializable_realm_instance.dtg.toml      | 10 ++++++++--
 .../tensor_instance_backing.dtg.toml          |  2 +-
 .../src/realm-execution/hash/processor.cc     | 15 --------------
 .../realm-execution/instance_allocation.cc    |  7 +++++++
 .../pcg_instance/pcg_instance.cc              |  5 +++++
 .../tasks/realm_task_registry.cc              |  7 ++++++-
 .../serializer/serializable_realm_instance.cc | 12 +++++++++--
 13 files changed, 45 insertions(+), 46 deletions(-)
 delete mode 100644 lib/realm-execution/include/realm-execution/hash/processor.h
 delete mode 100644 lib/realm-execution/src/realm-execution/hash/processor.cc

diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix
index 9f1fb8832c..b7985b497d 100644
--- a/.flake/pkgs/realm.nix
+++ b/.flake/pkgs/realm.nix
@@ -13,13 +13,13 @@ in
 
 stdenv.mkDerivation rec {
   pname = "realm";
-  version = "2026-02-18";
+  version = "2026-02-22-prealm";
 
   src = fetchFromGitHub {
     owner = "StanfordLegion";
     repo = "realm";
-    rev = "47f18543592cb69c5bc7c97ee7e2bc521d377d3e";
-    sha256 = "sha256-brAWh2p67hIyfrtNKN+6XZjIB0V2gYGBjdIocuwtmj4=";
+    rev = "6ab01f413926a2428c3c799a345f69b4807d5595";
+    sha256 = "sha256-MN8nJ9O6oCZbbrE/ROvIlogtXJiSLsVZxoVXJUTeSHs=";
   };
 
   nativeBuildInputs = [
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
index 268be3583d..1173d75b27 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -2,7 +2,6 @@
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
 
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
-#include "realm-execution/hash/processor.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
 #include <unordered_map>
diff --git a/lib/realm-execution/include/realm-execution/hash/processor.h b/lib/realm-execution/include/realm-execution/hash/processor.h
deleted file mode 100644
index efe6e6186b..0000000000
--- a/lib/realm-execution/include/realm-execution/hash/processor.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_HASH_PROCESSOR_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_HASH_PROCESSOR_H
-
-#include "realm-execution/realm.h"
-#include <utility>
-
-#ifdef FLEXFLOW_USE_PREALM
-
-namespace std {
-
-template <>
-struct hash<::FlexFlow::Realm::Processor> {
-  size_t operator()(::FlexFlow::Realm::Processor const &p) const;
-};
-
-} // namespace std
-
-#endif
-
-#endif
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index 09709201ce..95530c0eee 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -18,6 +18,9 @@ TensorInstanceBacking perform_instance_allocation(
         &preallocated,
     RealmContext &ctx);
 
+void destroy_instances(TensorInstanceBacking const &instances,
+                       Realm::Event precondition);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index e754fbbf5c..db338e4e4b 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -35,6 +35,7 @@ struct PCGInstance {
       PerDeviceOpStateBacking const &device_state_backing,
       OptimizerAttrs const &optimizer_attrs,
       std::optional<Realm::RegionInstance> logit_grad_tensor);
+  ~PCGInstance();
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   TensorInstanceBacking const &get_tensor_instance_backing() const;
diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h
index fe83e69583..b6913e66f5 100644
--- a/lib/realm-execution/include/realm-execution/realm.h
+++ b/lib/realm-execution/include/realm-execution/realm.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 
-// #define FLEXFLOW_USE_PREALM
+#define FLEXFLOW_USE_PREALM
 
 #ifdef FLEXFLOW_USE_PREALM
 #include <realm/prealm/prealm.h>
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
index 150801367d..5b70c6888b 100644
--- a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
@@ -12,6 +12,12 @@ includes = [
   "realm-execution/realm.h",
 ]
 
+src_includes = [
+  "utils/fmt/vector.h",
+  "utils/hash/vector.h",
+]
+
 [[fields]]
-name = "id"
-type = "::FlexFlow::Realm::RegionInstance::id_t"
+name = "instance"
+# Realm::RegionInstance has hidden fields in PRealm so we need to encode it as bytes
+type = "std::vector<uint8_t>"
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index 6c43990282..b8533dbcc9 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -16,8 +16,8 @@ includes = [
 src_includes = [
   "realm-execution/fmt/realm_event.h",
   "realm-execution/fmt/realm_instance.h",
-  "utils/hash/unordered_map.h",
   "utils/fmt/unordered_map.h",
+  "utils/hash/unordered_map.h",
 ]
 
 [[fields]]
diff --git a/lib/realm-execution/src/realm-execution/hash/processor.cc b/lib/realm-execution/src/realm-execution/hash/processor.cc
deleted file mode 100644
index 5a8624f676..0000000000
--- a/lib/realm-execution/src/realm-execution/hash/processor.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "realm-execution/hash/processor.h"
-#include <utility>
-
-#ifdef FLEXFLOW_USE_PREALM
-
-namespace std {
-
-size_t hash<::FlexFlow::Realm::Processor>::operator()(
-    ::FlexFlow::Realm::Processor const &p) const {
-  return hash<::FlexFlow::Realm::Processor::id_t>{}(p.id);
-}
-
-} // namespace std
-
-#endif
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index 797455573c..e003e5b71a 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -72,4 +72,11 @@ TensorInstanceBacking perform_instance_allocation(
   return result;
 }
 
+void destroy_instances(TensorInstanceBacking const &instances,
+                       Realm::Event precondition) {
+  for (auto const &[instance, ready] : values(instances.backing)) {
+    instance.destroy(Realm::Event::merge_events(precondition, ready));
+  }
+}
+
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
index 4b08e9a430..d78ed68988 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -39,6 +39,11 @@ PCGInstance::PCGInstance(
       device_state_backing(device_state_backing),
       optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {}
 
+PCGInstance::~PCGInstance() {
+  destroy_instances(this->tensor_instance_backing,
+                    ctx.get_outstanding_events());
+}
+
 RealmContext &PCGInstance::get_realm_context() {
   return this->ctx;
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index fa056d6f33..a9c134af01 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -17,10 +17,15 @@ Realm::Event register_task(Realm::Processor::Kind target_kind,
                                              void const *,
                                              size_t,
                                              Realm::Processor)) {
+  Realm::Processor::TaskFuncID realm_task_id =
+      get_realm_task_id_for_task_id(func_id);
+#ifdef FLEXFLOW_USE_PREALM
+  Realm::prealm_task_name(realm_task_id, fmt::format("{}", func_id));
+#endif
   return Realm::Processor::register_task_by_kind(
       target_kind,
       /*global=*/false,
-      get_realm_task_id_for_task_id(func_id),
+      realm_task_id,
       Realm::CodeDescriptor(task_body),
       Realm::ProfilingRequestSet());
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
index f2d42a96ca..0e58d6e36c 100644
--- a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
@@ -1,15 +1,23 @@
 #include "realm-execution/tasks/serializer/serializable_realm_instance.h"
+#include "utils/exception.h"
+#include <type_traits>
 
 namespace FlexFlow {
 
+// Realm::RegionInstance is trivially copyable so it's safe to treat it as bytes
+static_assert(std::is_trivially_copy_constructible_v<Realm::RegionInstance>);
+
 SerializableRealmInstance
     realm_instance_to_serializable(Realm::RegionInstance const &inst) {
-  return SerializableRealmInstance{inst.id};
+  uint8_t const *data = reinterpret_cast<uint8_t const *>(&inst);
+  return SerializableRealmInstance{
+      std::vector<uint8_t>{data, data + sizeof(inst)}};
 }
 
 Realm::RegionInstance
     realm_instance_from_serializable(SerializableRealmInstance const &inst) {
-  return Realm::RegionInstance{inst.id};
+  ASSERT(inst.instance.size() == sizeof(Realm::RegionInstance));
+  return *reinterpret_cast<Realm::RegionInstance const *>(inst.instance.data());
 }
 
 } // namespace FlexFlow

From c4eeb3f7cd8ea2a18d7598c929bed1de10bc1421 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Tue, 24 Feb 2026 14:08:29 -0800
Subject: [PATCH 086/113] Update to Realm main commit for PRealm.

---
 .flake/pkgs/realm.nix | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix
index b7985b497d..336b1c050c 100644
--- a/.flake/pkgs/realm.nix
+++ b/.flake/pkgs/realm.nix
@@ -13,13 +13,13 @@ in
 
 stdenv.mkDerivation rec {
   pname = "realm";
-  version = "2026-02-22-prealm";
+  version = "2026-02-24";
 
   src = fetchFromGitHub {
     owner = "StanfordLegion";
     repo = "realm";
-    rev = "6ab01f413926a2428c3c799a345f69b4807d5595";
-    sha256 = "sha256-MN8nJ9O6oCZbbrE/ROvIlogtXJiSLsVZxoVXJUTeSHs=";
+    rev = "42f7484a80e0bdacaf47d9a758822f5327348dd0";
+    sha256 = "sha256-IHiokPmTjEV5df3fr1Xubuyt2N1CFI2fA7Q2TsbxS3Y=";
   };
 
   nativeBuildInputs = [

From 3ef8ee95da024bd0f43c0a11c804319af90da9dd Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 25 Feb 2026 13:46:29 -0800
Subject: [PATCH 087/113] Add a switch to control PRealm.

---
 CMakeLists.txt                                              | 1 +
 cmake/flexflow-utils.cmake                                  | 1 +
 lib/realm-execution/include/realm-execution/realm.h         | 6 ++----
 .../src/realm-execution/tasks/realm_task_registry.cc        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc1a296dbe..4723a3168d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ set(FF_MAX_NUM_TASK_REGIONS "20" CACHE STRING
 set(FF_MAX_NUM_TASK_ARGUMENTS "5" CACHE STRING
   "Maximum number of arguments that can be declared in a TaskSignature")
 option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF)
+option(FF_USE_PREALM "Build with PRealm profiling interface" ON)
 option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)
 option(FF_USE_PYTHON "Enable Python" ON)
 option(FF_BUILD_FROM_PYPI "Build from pypi" OFF)
diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake
index ef5d6d9d11..795668e32a 100644
--- a/cmake/flexflow-utils.cmake
+++ b/cmake/flexflow-utils.cmake
@@ -17,6 +17,7 @@ function(define_ff_vars target)
     MAX_NUM_FUSED_TENSORS=${FF_MAX_NUM_FUSED_TENSORS}
     MAX_NUM_WORKERS=${FF_MAX_NUM_WORKERS}
     FF_USE_NCCL=${FF_USE_NCCL}
+    FF_USE_PREALM=${FF_USE_PREALM}
     MAX_TENSOR_DIM=${FF_MAX_DIM}
     MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS}
     MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS}
diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h
index b6913e66f5..814132d355 100644
--- a/lib/realm-execution/include/realm-execution/realm.h
+++ b/lib/realm-execution/include/realm-execution/realm.h
@@ -1,9 +1,7 @@
 #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H
 
-#define FLEXFLOW_USE_PREALM
-
-#ifdef FLEXFLOW_USE_PREALM
+#ifdef FF_USE_PREALM
 #include <realm/prealm/prealm.h>
 #else
 #include <realm.h>
@@ -11,7 +9,7 @@
 
 namespace FlexFlow {
 
-#ifdef FLEXFLOW_USE_PREALM
+#ifdef FF_USE_PREALM
 namespace Realm = ::PRealm;
 #else
 namespace Realm = ::Realm;
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index a9c134af01..09d99655c0 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -19,7 +19,7 @@ Realm::Event register_task(Realm::Processor::Kind target_kind,
                                              Realm::Processor)) {
   Realm::Processor::TaskFuncID realm_task_id =
       get_realm_task_id_for_task_id(func_id);
-#ifdef FLEXFLOW_USE_PREALM
+#ifdef FF_USE_PREALM
   Realm::prealm_task_name(realm_task_id, fmt::format("{}", func_id));
 #endif
   return Realm::Processor::register_task_by_kind(

From b54353d995d5e8358ce17de686fa1402873de059 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 27 Feb 2026 10:15:00 -0800
Subject: [PATCH 088/113] Update rect constructor.

---
 .../include/realm-execution/realm_context.h           |  1 +
 .../src/realm-execution/realm_context.cc              | 11 ++++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index b8baad41b9..b018a04a87 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -4,6 +4,7 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/managed_per_device_ff_handle.h"
+#include "op-attrs/tensor_shape.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "realm-execution/realm.h"
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 3427e8cbee..10ed07118b 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -137,12 +137,13 @@ Realm::Event RealmContext::collective_spawn_task(Realm::Processor target_proc,
   return result;
 }
 
-template <int N>
-static Realm::Rect<N> rect_from_dims(TensorDims const &dims) {
+template <int N, typename T = int>
+static Realm::Rect<N, T> rect_from_dims(TensorDims const &dims) {
   std::vector<int> values{dims.ff_ordered.begin(), dims.ff_ordered.end()};
-  return Realm::Rect<N>{Realm::Point<N>::ZEROES(),
-                        Realm::Point<N>{values.data()} -
-                            Realm::Point<N>::ONES()};
+  ASSERT(values.size() == N);
+  return Realm::Rect<N, T>{Realm::Point<N, T>::ZEROES(),
+                           Realm::Point<N, T>{values.data()} -
+                               Realm::Point<N, T>::ONES()};
 }
 
 std::pair<Realm::RegionInstance, Realm::Event>

From 218408864b58534f60aebb64376112c3ecd7575b Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Wed, 4 Mar 2026 01:48:25 -0800
Subject: [PATCH 089/113] Pivot docs to be primarily through doxygen

---
 .editorconfig                                 |   5 +
 bin/README.md                                 |   9 -
 .../{README.md => index.dox}                  |  29 +-
 .../src/export-model-arch/main.cc             |   2 +
 bin/index.dox                                 |  11 +
 docs/doxygen/Doxyfile                         |  19 +-
 lib/README.md                                 |  12 -
 .../op_cost_estimate_key.dtg.toml             |   6 +
 .../compiler/machine_mapping/index.dox        |  12 +
 .../compiler/unity_algorithm/index.dox        |   8 +
 lib/compiler/index.dox                        |  12 +
 lib/index.dox                                 |  29 ++
 lib/kernels/index.dox                         |   5 +
 .../include/local-execution/README.md         |  13 -
 lib/local-execution/index.dox                 |  19 +
 lib/models/include/models/bert/bert.h         |   4 +-
 .../include/models/candle_uno/candle_uno.h    |   6 +-
 lib/models/include/models/dlrm/dlrm.h         |   4 +-
 .../include/models/transformer/transformer.h  |   6 +-
 lib/models/index.dox                          |  15 +
 .../src/models/split_test/split_test.cc       |   2 +
 lib/op-attrs/include/op-attrs/ops/index.dox   |   5 +
 lib/op-attrs/index.dox                        |  17 +
 .../include/pcg/computation_graph_builder.h   |   6 +
 lib/pcg/include/pcg/file_format/v1/index.dox  |   5 +
 lib/pcg/index.dox                             |  23 +
 lib/realm-execution/README.md                 |  32 --
 .../include/realm-execution/dependency_set.h  |   3 +
 .../pcg_instance/pcg_instance.h               |  14 +-
 .../include/realm-execution/realm_context.h   |   6 +
 .../include/realm-execution/realm_manager.h   |   6 +
 .../realm-execution/tasks/impl/op_task.h      |  26 +
 .../realm-execution/tasks/task_id_t.dtg.toml  |  13 +
 lib/realm-execution/index.dox                 |  38 ++
 .../tasks/impl/device_handle_init_task.cc     |   9 +-
 .../src/realm-execution/tasks/impl/op_task.cc |  25 +-
 .../tasks/realm_task_registry.cc              |   2 +-
 lib/runtime/src/machine_model.cc              |   2 +-
 lib/runtime/src/task_spec/README.md           | 176 -------
 lib/substitutions/README.md                   |  34 --
 .../substitutions/substitution.dtg.toml       |  12 +
 lib/substitutions/index.dox                   |  26 +
 lib/task-spec/index.dox                       |   5 +
 lib/utils/README.md                           | 449 ------------------
 lib/utils/include/utils/cli/index.dox         |   9 +
 lib/utils/include/utils/containers/index.dox  |  12 +
 lib/utils/include/utils/graph/README.md       |  28 +-
 lib/utils/include/utils/graph/index.dox       |  50 ++
 lib/utils/index.dox                           |   9 +
 49 files changed, 485 insertions(+), 785 deletions(-)
 delete mode 100644 bin/README.md
 rename bin/export-model-arch/{README.md => index.dox} (76%)
 create mode 100644 bin/index.dox
 delete mode 100644 lib/README.md
 create mode 100644 lib/compiler/include/compiler/machine_mapping/index.dox
 create mode 100644 lib/compiler/include/compiler/unity_algorithm/index.dox
 create mode 100644 lib/compiler/index.dox
 create mode 100644 lib/index.dox
 create mode 100644 lib/kernels/index.dox
 delete mode 100644 lib/local-execution/include/local-execution/README.md
 create mode 100644 lib/local-execution/index.dox
 create mode 100644 lib/models/index.dox
 create mode 100644 lib/op-attrs/include/op-attrs/ops/index.dox
 create mode 100644 lib/op-attrs/index.dox
 create mode 100644 lib/pcg/include/pcg/file_format/v1/index.dox
 create mode 100644 lib/pcg/index.dox
 delete mode 100644 lib/realm-execution/README.md
 create mode 100644 lib/realm-execution/index.dox
 delete mode 100644 lib/runtime/src/task_spec/README.md
 delete mode 100644 lib/substitutions/README.md
 create mode 100644 lib/substitutions/index.dox
 create mode 100644 lib/task-spec/index.dox
 delete mode 100644 lib/utils/README.md
 create mode 100644 lib/utils/include/utils/cli/index.dox
 create mode 100644 lib/utils/include/utils/containers/index.dox
 create mode 100644 lib/utils/include/utils/graph/index.dox
 create mode 100644 lib/utils/index.dox

diff --git a/.editorconfig b/.editorconfig
index 71bcacde7d..f339516d70 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -4,6 +4,7 @@ root = true
 [*]
 end_of_line = lf
 insert_final_newline = true
+trim_trailing_whitespace = true
 
 [{CMakeLists.txt,*.cmake}]
 indent_style = space
@@ -24,3 +25,7 @@ indent_size = 2
 [*.md]
 indent_style = space
 indent_size = 2
+
+[*.dox]
+indent_style = space
+indent_size = 2
diff --git a/bin/README.md b/bin/README.md
deleted file mode 100644
index d0b8ccd018..0000000000
--- a/bin/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# bin
-
-This directory contains command-line interfaces for FlexFlow Train and associated tools (all in C++). 
-A short description of each is included below--more information can be found in the `README.md` files
-in each of the corresponding directories (e.g., [here](./export-model-arch/README.md) for `export-model-arch`):
-
-- `export-model-arch`: Exports the model computation graphs defined in the [models](../lib/models/) library as either JSON (for use outside of FlexFlow) or as DOT (for visualization). Can also optionally export the SP decompositions of the computation graphs.
-- `substitution-to-dot`: Converts TASO-generated substitutions from the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)) into DOT for visualization.
-- `protobuf-to-json`: Converts TASO-generated substitutions from the legacy protobuf format ([example](../substitutions/graph_subst_3_v2.pb)) to the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)). Will be removed in the future once the substitution generator is integrated natively into FlexFlow Train (tracked in [#351](https://github.com/flexflow/flexflow-train/issues/351)).
diff --git a/bin/export-model-arch/README.md b/bin/export-model-arch/index.dox
similarity index 76%
rename from bin/export-model-arch/README.md
rename to bin/export-model-arch/index.dox
index 80b6c3ef04..d804a424b7 100644
--- a/bin/export-model-arch/README.md
+++ b/bin/export-model-arch/index.dox
@@ -1,27 +1,34 @@
-# export-model-arch
+/**
 
-A tool for exporting and visualizing the model computation graphs defined in [models](../lib/models).
-To build and run `export-model-arch`, run the following commands from the root of the FlexFlow Train repository:
-```console
+@page export-model-arch export-model-arch
+
+A tool for exporting and visualizing the model computation graphs defined in @ref models.
+To build and run export-model-arch, run the following commands from the root of the FlexFlow Train repository:
+
+\verbatim
 $ proj cmake # if you haven't already
 ...
 $ proj build
 ...
 $ ./build/normal/bin/export-model-arch/export-model-arch -h
-```
+\endverbatim
+
 The above should print the help message for `export-model-arch`. A few example commands are also listed below:
 
 - Export the `split_test` model in JSON (e.g., for processing outside of FlexFlow Train):
-```console
+
+\verbatim
 $ ./build/normal/bin/export-model-arch/export-model-arch split_test
-```
+\endverbatim
 
 - Export the `split_test` model in JSON along with the SP decomposition of the model's computation graph:
-```console
+\verbatim
 $ ./build/normal/bin/export-model-arch/export-model-arch --sp-decomposition split_test
-```
+\endverbatim
 
 - Export the `split_test` model as DOT (e.g., for visualization using a [local](https://github.com/jrfonseca/xdot.py) or [web-based](https://dreampuf.github.io/GraphvizOnline/) DOT viewer)
-```console
+\verbatim
 $ ./build/normal/bin/export-model-arch/export-model-arch --dot split_test
-```
+\endverbatim
+
+*/
diff --git a/bin/export-model-arch/src/export-model-arch/main.cc b/bin/export-model-arch/src/export-model-arch/main.cc
index 29be28b0ef..0c2cfbdb6b 100644
--- a/bin/export-model-arch/src/export-model-arch/main.cc
+++ b/bin/export-model-arch/src/export-model-arch/main.cc
@@ -118,6 +118,7 @@ tl::expected<JsonSPModelExport, std::string>
 }
 
 int main(int argc, char **argv) {
+//! [utils/cli example]
   CLISpec cli = empty_cli_spec();
 
   CLIArgumentKey arg_key_help = cli_add_help_flag(cli);
@@ -182,6 +183,7 @@ int main(int argc, char **argv) {
   bool sp_decompositition = cli_get_flag(parsed, key_sp_decomposition);
   bool dot = cli_get_flag(parsed, key_dot);
   bool preprocessed_dot = cli_get_flag(parsed, key_preprocessed_dot);
+//! [utils/cli example]
 
   auto handle_error = [](auto const &result) {
     if (!result.has_value()) {
diff --git a/bin/index.dox b/bin/index.dox
new file mode 100644
index 0000000000..08d73c1d81
--- /dev/null
+++ b/bin/index.dox
@@ -0,0 +1,11 @@
+/**
+
+\mainpage bin
+
+This directory contains command-line interfaces for %FlexFlow %Train and associated tools (all in C++).
+
+- \subpage export-model-arch "": Exports the model computation graphs defined in the @ref models library as either JSON (for use outside of FlexFlow) or as DOT (for visualization). Can also optionally export the SP decompositions of the computation graphs.
+- \subpage substitution-to-dot "": Converts TASO-generated substitutions from the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)) into DOT for visualization.
+- \subpage protobuf-to-json "": Converts TASO-generated substitutions from the legacy protobuf format ([example](../substitutions/graph_subst_3_v2.pb)) to the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)). Will be removed in the future once the substitution generator is integrated natively into FlexFlow Train (tracked in [#351](https://github.com/flexflow/flexflow-train/issues/351)).
+
+*/
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 32b8da3828..d68de44c85 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -372,7 +372,7 @@ TOC_INCLUDE_HEADINGS   = 5
 # The default value is: DOXYGEN.
 # This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
 
-MARKDOWN_ID_STYLE      = DOXYGEN
+MARKDOWN_ID_STYLE      = GITHUB
 
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
@@ -390,7 +390,7 @@ AUTOLINK_SUPPORT       = YES
 # diagrams that involve STL classes more complete and accurate.
 # The default value is: NO.
 
-BUILTIN_STL_SUPPORT    = NO
+BUILTIN_STL_SUPPORT    = YES
 
 # If you use Microsoft's C++/CLI language, you should set this option to YES to
 # enable parsing support.
@@ -636,7 +636,7 @@ CASE_SENSE_NAMES       = YES
 # scope will be hidden.
 # The default value is: NO.
 
-HIDE_SCOPE_NAMES       = NO
+HIDE_SCOPE_NAMES       = YES
 
 # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
 # append additional text to a page's title, such as Class Reference. If set to
@@ -990,8 +990,9 @@ FILE_PATTERNS          = *.c \
                          *.cu \
                          *.h \
                          *.hpp \
-                         *.md \
-                         *.py
+                         *.py \
+                         *.dox \
+                         *.md
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
 # be searched for input files as well.
@@ -1006,7 +1007,8 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                =
+EXCLUDE                = lib/realm-execution/include/realm-execution/realm.h \
+                         lib/runtime/
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -1036,7 +1038,8 @@ EXCLUDE_SYMBOLS        =
 # that contain example code fragments that are included (see the \include
 # command).
 
-EXAMPLE_PATH           =
+EXAMPLE_PATH           = $(FF_HOME)/lib \
+                         $(FF_HOME)/bin
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
@@ -1050,7 +1053,7 @@ EXAMPLE_PATTERNS       = *
 # irrespective of the value of the RECURSIVE tag.
 # The default value is: NO.
 
-EXAMPLE_RECURSIVE      = NO
+EXAMPLE_RECURSIVE      = YES
 
 # The IMAGE_PATH tag can be used to specify one or more files or directories
 # that contain images that are to be included in the documentation (see the
diff --git a/lib/README.md b/lib/README.md
deleted file mode 100644
index 5600c8e6aa..0000000000
--- a/lib/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# C++ Library Code
-
-This directory contains the core C++ code that underlies FlexFlow, organized into the following libraries:
-
-- `compiler`: Contains 
-- `kernels`:
-- `op-attrs`:
-- `pcg`: Contains the definitions of computation graphs and parallel computation graphs,
-         as well as code for serializing and deserializing both graphs
-- `runtime`:
-- `substitutions`: Contains the definitions of pcg substitutions, as well as the code for serializing them
-- `utils`:
diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml
index 6a3d4987ac..42435312c3 100644
--- a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml
+++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml
@@ -7,6 +7,12 @@ features = [
   "fmt",
   "hash",
 ]
+docstring = """
+@brief The minimum amount of information needed to compute the cost of an
+operator (runtime and memory).
+
+For the runtime-only analogue, see RuntimeOnlyOpCostEstimateKey
+"""
 
 includes = [
   "op-attrs/pcg_operator_attrs.dtg.h",
diff --git a/lib/compiler/include/compiler/machine_mapping/index.dox b/lib/compiler/include/compiler/machine_mapping/index.dox
new file mode 100644
index 0000000000..2858103b2d
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/index.dox
@@ -0,0 +1,12 @@
+/**
+
+@page machine-mapping Machine Mapping
+
+Contains the representations and logic for mappings of operators to machines/devices/GPUs.
+
+Core functionality includes:
+- \ref FlexFlow::MachineView "MachineView": the compiler-side representation of a mapping.
+  For the runtime-side representation, see \ref FlexFlow::MappedOperatorTaskGroup
+- \ref allowed_machine_views.h
+
+*/
diff --git a/lib/compiler/include/compiler/unity_algorithm/index.dox b/lib/compiler/include/compiler/unity_algorithm/index.dox
new file mode 100644
index 0000000000..97d0865502
--- /dev/null
+++ b/lib/compiler/include/compiler/unity_algorithm/index.dox
@@ -0,0 +1,8 @@
+/**
+
+@page unity-dp-algorithm Unity DP Algorithm
+
+\see
+  https://www.lockshaw.net/static/unity.pdf
+
+*/
diff --git a/lib/compiler/index.dox b/lib/compiler/index.dox
new file mode 100644
index 0000000000..bcf048cb1d
--- /dev/null
+++ b/lib/compiler/index.dox
@@ -0,0 +1,12 @@
+/**
+
+@page compiler compiler
+
+Implements the core logic of the compiler.
+This includes:
+
+- \subpage unity-dp-algorithm "Unity DP Algorithm"
+- \subpage task-graph-simulator "Task Graph Simulator"
+- \subpage machine-mapping "Machine Mapping"
+
+*/
diff --git a/lib/index.dox b/lib/index.dox
new file mode 100644
index 0000000000..618f53104e
--- /dev/null
+++ b/lib/index.dox
@@ -0,0 +1,29 @@
+/**
+
+\mainpage lib
+
+This directory contains the core C++ code that underlies %FlexFlow, organized into the following libraries:
+
+- \subpage compiler "":
+- \subpage kernels "":
+- \subpage op-attrs "":
+- \subpage pcg "": Contains the definitions of \ref FlexFlow::ComputationGraph and
+  \ref FlexFlow::ParallelComputationGraph, as well as code for serializing and deserializing
+  both graphs (\ref file-format)
+- \subpage substitutions "substitutions": Contains the definitions of pcg substitutions
+  (i.e., FlexFlow::Substitution), as well as the code for serializing them
+- \subpage utils "": Various utility and support libraries for the rest of the
+  project. Particularly useful are @ref "utils-graph", @ref "utils-containers",
+  and @ref "utils-cli".
+- \subpage models "":
+- \subpage task-spec "":
+- \subpage local-execution "":
+- \subpage realm-execution "":
+
+\section deprecated-libs Deprecated
+
+- \c "local-pcg-execution":
+- \c "ffi":
+- \c "substitution-generator":
+- \c "runtime": Out-of-date code migrated from the old %FlexFlow codebase. Currently kept around for reference, but will eventually be removed.
+*/
diff --git a/lib/kernels/index.dox b/lib/kernels/index.dox
new file mode 100644
index 0000000000..8ca8ad33ea
--- /dev/null
+++ b/lib/kernels/index.dox
@@ -0,0 +1,5 @@
+/**
+
+\page kernels kernels
+
+*/
diff --git a/lib/local-execution/include/local-execution/README.md b/lib/local-execution/include/local-execution/README.md
deleted file mode 100644
index cc68162afc..0000000000
--- a/lib/local-execution/include/local-execution/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-The primary external-facing interface of local-execution.
-
-Major components:
-
-* `computation_graph_instance.h`: is the main external facing interface
-  * Takes a `ComputationGraph` as input, expands and initializes it
-  * Provides various methods to run all or a subset of passes
-* `local_task_registry.h`: functions to retrieve task implementations
-  * Not a dynamic registry: tasks are all static now
-* `local_task_argument_accessor.h`: local wrapper for `ITaskArgumentAccessor`
-  * Stores all of the necessary data required for a task to execute
-* `task_execution.h`: utilities to prepare and execute tasks
-* `tensor_allocation.h`: a pass for the dataflow graph that allocates all tensors
diff --git a/lib/local-execution/index.dox b/lib/local-execution/index.dox
new file mode 100644
index 0000000000..aeaf73fc0f
--- /dev/null
+++ b/lib/local-execution/index.dox
@@ -0,0 +1,19 @@
+/**
+
+\page local-execution local-execution
+
+The primary external-facing interface of local-execution.
+
+Major components:
+
+- \ref "computation_graph_instance.h": is the main external facing interface
+  - Takes a FlexFlow::ComputationGraph as input, expands and initializes it
+  - Provides various methods to run all or a subset of passes
+- \ref "local_task_registry.h": functions to retrieve task implementations
+  - Not a dynamic registry: tasks are all static now
+- \ref "local_task_argument_accessor.h": local wrapper for FlexFlow::ITaskArgumentAccessor
+  - Stores all of the necessary data required for a task to execute
+- \ref "task_execution.h": utilities to prepare and execute tasks
+- \ref "tensor_allocation.h": a pass for the dataflow graph that allocates all tensors
+
+*/
diff --git a/lib/models/include/models/bert/bert.h b/lib/models/include/models/bert/bert.h
index 0047996b78..51c5a694c9 100644
--- a/lib/models/include/models/bert/bert.h
+++ b/lib/models/include/models/bert/bert.h
@@ -31,10 +31,10 @@ BertConfig get_default_bert_config();
  *
  * @note This is a plain encoder-only model for pre-training.
  *
- * @param BertConfig The config of BERT model.
+ * @param config The config of BERT model.
  * @return ComputationGraph The computation graph of a BERT model.
  */
-ComputationGraph get_bert_computation_graph(BertConfig const &);
+ComputationGraph get_bert_computation_graph(BertConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/models/include/models/candle_uno/candle_uno.h b/lib/models/include/models/candle_uno/candle_uno.h
index a2d21f2830..bee398b71d 100644
--- a/lib/models/include/models/candle_uno/candle_uno.h
+++ b/lib/models/include/models/candle_uno/candle_uno.h
@@ -31,10 +31,10 @@ CandleUnoConfig get_default_candle_uno_config();
  * map from specific data identifier in the dataset to the feature name used in
  * this model.
  *
- * @param CandleUnoConfig The config of the Candle Uno model.
- * @return ComputationGraph The PCG of a Transformer model.
+ * @param config The config of the Candle Uno model.
+ * @return The computation graph of a Candle Uno model.
  */
-ComputationGraph get_candle_uno_computation_graph(CandleUnoConfig const &);
+ComputationGraph get_candle_uno_computation_graph(CandleUnoConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/models/include/models/dlrm/dlrm.h b/lib/models/include/models/dlrm/dlrm.h
index c3443f3b9b..481f02957d 100644
--- a/lib/models/include/models/dlrm/dlrm.h
+++ b/lib/models/include/models/dlrm/dlrm.h
@@ -48,8 +48,8 @@ tensor_guid_t create_dlrm_interact_features(
 /**
  * @brief Get the DLRM computation graph.
  *
- * @param DLRMConfig The config of DLRM model.
- * @return ComputationGraph The computation graph of a DLRM model.
+ * @param config The config of DLRM model.
+ * @return The computation graph of a DLRM model.
  */
 ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config);
 
diff --git a/lib/models/include/models/transformer/transformer.h b/lib/models/include/models/transformer/transformer.h
index 385100a4c9..da6bfc3a9d 100644
--- a/lib/models/include/models/transformer/transformer.h
+++ b/lib/models/include/models/transformer/transformer.h
@@ -37,10 +37,10 @@ TransformerConfig get_default_transformer_config();
 /**
  * @brief Get the Transformer computation graph.
  *
- * @param TransformerConfig The config of Transformer model.
- * @return ComputationGraph The PCG of a Transformer model.
+ * @param config The config of Transformer model.
+ * @return The computation graph of a Transformer model.
  */
-ComputationGraph get_transformer_computation_graph(TransformerConfig const &);
+ComputationGraph get_transformer_computation_graph(TransformerConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/models/index.dox b/lib/models/index.dox
new file mode 100644
index 0000000000..5ca79b3b34
--- /dev/null
+++ b/lib/models/index.dox
@@ -0,0 +1,15 @@
+/**
+
+\page models models
+
+\section real-models Real Models
+- \subpage bert "BERT"
+- \subpage candle-uno "Candle UNO"
+- \subpage dlrm "DLRM"
+- \subpage inception-v3 "Inception v3"
+- \subpage transformer "Transformer"
+
+\section test-models Artificial Models for Testing
+- \subpage split-test
+
+*/
diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc
index 67d2f74ce0..a091ba3ce7 100644
--- a/lib/models/src/models/split_test/split_test.cc
+++ b/lib/models/src/models/split_test/split_test.cc
@@ -5,6 +5,7 @@
 namespace FlexFlow {
 
 ComputationGraph get_split_test_computation_graph(positive_int batch_size) {
+//! [ComputationGraphBuilder example]
   ComputationGraphBuilder cgb;
 
   positive_int layer_dim1 = 256_p;
@@ -34,6 +35,7 @@ ComputationGraph get_split_test_computation_graph(positive_int batch_size) {
   t = cgb.softmax(t);
 
   return cgb.computation_graph;
+//! [ComputationGraphBuilder example]
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ops/index.dox b/lib/op-attrs/include/op-attrs/ops/index.dox
new file mode 100644
index 0000000000..4881221743
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ops/index.dox
@@ -0,0 +1,5 @@
+/**
+
+\page op-attrs-ops Operator Descriptions
+
+*/
diff --git a/lib/op-attrs/index.dox b/lib/op-attrs/index.dox
new file mode 100644
index 0000000000..6d3d6d8f60
--- /dev/null
+++ b/lib/op-attrs/index.dox
@@ -0,0 +1,17 @@
+/**
+
+\page op-attrs op-attrs
+
+Contains the compiler-side definition of all of the operators and associated functions for reasoning about their behavior, as well as the fundamental concepts needed to represent them.
+Key pieces include:
+
+- Representing tensors in the compiler:
+  \ref FlexFlow::TensorShape, \ref FlexFlow::TensorDims
+- Representing parallel/sharded/distributed tensors in the compiler:
+  \ref FlexFlow::ParallelTensorShape, \ref FlexFlow::ParallelTensorDimDegrees
+- The actual operator definitions: \subpage op-attrs-ops "ops/"
+- Computing data dependencies of operators computing over parallel tensors:
+  \ref get_operator_to_parallel_tensor_space_mappings.h
+
+
+*/
diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h
index 064a4dd20d..ddc9ee312a 100644
--- a/lib/pcg/include/pcg/computation_graph_builder.h
+++ b/lib/pcg/include/pcg/computation_graph_builder.h
@@ -6,6 +6,12 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief A helper interface for building a ComputationGraph in a PyTorch (i.e., weight-implicit) style.
+ *
+ * For an example of how to use it, see the following code from \ref models:
+ * \snippet lib/models/src/models/split_test/split_test.cc ComputationGraphBuilder example
+ */
 struct ComputationGraphBuilder {
 public:
   ComputationGraphBuilder();
diff --git a/lib/pcg/include/pcg/file_format/v1/index.dox b/lib/pcg/include/pcg/file_format/v1/index.dox
new file mode 100644
index 0000000000..e6d0d4be4f
--- /dev/null
+++ b/lib/pcg/include/pcg/file_format/v1/index.dox
@@ -0,0 +1,5 @@
+/**
+
+@page file-format pcg/file_format/v1
+
+*/
diff --git a/lib/pcg/index.dox b/lib/pcg/index.dox
new file mode 100644
index 0000000000..721c2ba062
--- /dev/null
+++ b/lib/pcg/index.dox
@@ -0,0 +1,23 @@
+/**
+
+\page pcg pcg
+
+Defines the top-level datastructures and their serialization formats, along with some helper interfaces for constructing and manipulating them.
+
+\section pcg-datastructures Key Datastructures
+
+- \ref FlexFlow::ComputationGraph "ComputationGraph": aka CG
+- \ref FlexFlow::ParallelComputationGraph "ParallelComputationGraph": aka PCG
+- \ref FlexFlow::MappedParallelComputationGraph "MappedParallelComputationGraph": aka MPCG
+
+\section serialization-formats Serialization
+
+- \subpage file-format "pcg/file_format"
+
+\section pcg-helpers Helper Functionality
+
+- \ref FlexFlow::ComputationGraphBuilder "ComputationGraphBuilder"
+- \ref FlexFlow::ParallelComputationGraphBuilder "ParallelComputationGraphBuilder"
+
+
+*/
diff --git a/lib/realm-execution/README.md b/lib/realm-execution/README.md
deleted file mode 100644
index 1454c7eac8..0000000000
--- a/lib/realm-execution/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-The Realm backend for distributed execution.
-
-This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution.
-
-Major components:
-
-* `PCGInstance`: the main public interface for the Realm backend. It takes a mapped PCG and lowers it through the dynamic graph to get the fully-specified execution order of tasks to be executed. Besides the usual dynamic graph passes (pass expansion, update insertion, shard expansion), this class also tracks the allocation of Realm instances for tensors.
-* `RealmManager`: manages the initialization and shutdown of the Realm runtime. Provides the interface to launch the controller that runs the rest of the computation.
-* `RealmContext`: an interface that wraps the rest of Realm and protects against certain classes of bugs, such as shutdown bugs. **Do NOT call Realm directly unless you know what you are doing.**
-* `tasks/`: the Realm task implementations and their supporting infrastructure.
-  * `impl/`: the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
-  * `serializer/`: additional support for serializing Realm data types.
-  * `realm_task_registry.h`: manages the registration of Realm tasks. All Realm tasks go through this interface.
-  * `task_id_t.h` and `realm_task_id_t.h`: types to represent Realm tasks, along with an encoding to Realm's native task ID type.
-
-Other components used mainly within `PCGInstance`:
-
- * `DistributedDeviceHandle`: represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
- * `DependenceSet`: tracks dependencies during execution of tasks.
- * `distributed_device_state_initialization.h`: performs device state initialization of dynamic graph nodes and returns the resulting `PerDeviceOpStateBacking`.
- * `instance_allocation.h`: allocates instances for tensors in the dynamic graph and returns the resulting `TensorInstanceBacking`.
-
-TODO list:
-
-* external instances
-* copies
-* task fusion
-* parallel operator implementation (partition, reduce, gather, etc.)
-* and fused parallel operators (reduce + broadcast = allreduce)
-* memory-optimizing compiler integration (tensor creation/destruction, tensor reuse)
-* control replication
-* Realm subgraphs
diff --git a/lib/realm-execution/include/realm-execution/dependency_set.h b/lib/realm-execution/include/realm-execution/dependency_set.h
index 629a40e2e7..bd6ab04cea 100644
--- a/lib/realm-execution/include/realm-execution/dependency_set.h
+++ b/lib/realm-execution/include/realm-execution/dependency_set.h
@@ -8,6 +8,9 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief Tracks dependencies during execution of tasks.
+ */
 struct DependencySet {
 public:
   DependencySet() = delete;
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
index db338e4e4b..b795d53d56 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -23,11 +23,22 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief The main public interface for the realm backend.
+ *
+ * It takes a MappedParallelComputationGraph and lowers it through the
+ * DynamicOpenDataflowGraph to get the fully-specified execution order of tasks
+ * to be executed. Besides the usual dynamic graph passes (\ref
+ * perform_pass_expansion, \ref perform_update_insertion, \ref
+ * perform_shard_expansion), this class also tracks the allocation of realm
+ * instances for tensors.
+ */
 struct PCGInstance {
 public:
   PCGInstance() = delete;
   PCGInstance(PCGInstance const &) = delete;
   PCGInstance(PCGInstance &&) = delete;
+
   explicit PCGInstance(
       RealmContext &ctx,
       std::vector<DynamicNodeInvocation> const &execution_order,
@@ -35,7 +46,9 @@ struct PCGInstance {
       PerDeviceOpStateBacking const &device_state_backing,
       OptimizerAttrs const &optimizer_attrs,
       std::optional<Realm::RegionInstance> logit_grad_tensor);
+
   ~PCGInstance();
+
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   TensorInstanceBacking const &get_tensor_instance_backing() const;
@@ -43,7 +56,6 @@ struct PCGInstance {
   OptimizerAttrs const &get_optimizer_attrs() const;
   void update_optimizer_attrs_for_next_iter();
   std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
-
 private:
   RealmContext &ctx;
   std::vector<DynamicNodeInvocation> execution_order;
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index b018a04a87..25bbecfb82 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -14,6 +14,12 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief An interface that wraps the rest of realm and protects against certain
+ * classes of bugs, such as shutdown bugs.
+ *
+ * @note Do NOT call Realm directly unless you know what you are doing.
+ */
 struct RealmContext {
 public:
   RealmContext(Realm::Processor processor);
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 8a79476bcf..5c673d4134 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -9,6 +9,12 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief Manages the initialization and shutdown of the realm runtime.
+ *
+ * Provides the interface to launch the controller that runs the rest of the computation
+ * (i.e., \ref start_controller).
+ */
 struct RealmManager : private RealmContext {
 public:
   RealmManager(int *argc, char ***argv);
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 8399742424..e089756741 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -18,6 +18,32 @@ namespace FlexFlow {
 
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
+/**
+ * @brief Launches the task for a DynamicNodeInvocation using realm.
+ *
+ * @note The task launch process functions a bit differently to that used in the
+ * previous FlexFlow codebase. Rather than having a function registered with
+ * realm/legion for every task_id_t, we now have only a few functions
+ * registered: @ref op_task_body, @ref device_handle_init_task_body,
+ * @ref device_state_init_return_task_body, and @ref controller_task_body (see
+ * @ref register_all_tasks for where this list comes from), and in fact only
+ * @ref op_task_body is launched by @ref spawn_op_task. Each of these registered
+ * tasks uses the serialized arguments sent to it to dispatch to the correct
+ * implementation in task-spec: for example, if we are trying to launch the task
+ * for a Conv2d operator, this function will actually dispatch a call to @ref
+ * op_task_body with a serialized OpTaskArgs as an argument, and then @ref
+ * op_task_body will deserialize the argument, determine that we are trying to
+ * launch the forward pass of Conv2d, use @ref execute_dynamic_node_invocation
+ * (which then uses @ref call_fwd_task_impl) to actually call the function in
+ * lib/task-spec/src/task-spec/ops/impl/conv_2d.cc
+ *
+ * @note The above also means that we don't have a separate
+ * ITaskArgumentAccessor subclass for realm-execution. Instead we ship the
+ * information on the corresponding realm instances over to the remote node,
+ * grab the corresponding pointer/GenericTensorAccessor, and then use
+ * LocalTaskArgumentAccessor for the actual argument access as, by this point,
+ * everything is local.
+ */
 Realm::Event spawn_op_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
index 97b19b5f51..d0abb95f5a 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
@@ -7,6 +7,19 @@ features = [
   "rapidcheck",
   "json",
 ]
+docstring = """
+@brief An enum for identifying tasks for use in the realm runtime.
+
+@note Many of these are pulled over from the old FlexFlow codebase and are no
+longer in use. Eventually these should be pruned down to the set of tasks we're
+actually using.
+
+@note @ref task_id_t is used by the realm runtime (i.e., `realm-execution`),
+but not by realm directly: realm-execution uses @ref
+get_realm_task_id_for_task_id to convert every @ref task_id_t into a
+Realm::Processor::TaskFuncID, which is what is actually used for task launches,
+etc.
+"""
 
 [[values]]
 name = "CONTROLLER_TASK_ID"
diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox
new file mode 100644
index 0000000000..a225ef5b30
--- /dev/null
+++ b/lib/realm-execution/index.dox
@@ -0,0 +1,38 @@
+/**
+
+\page realm-execution realm-execution
+
+The %Realm backend for distributed execution.
+
+This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution.
+
+\section realm-execution-major-components Major Components
+
+- \ref "FlexFlow::PCGInstance": the main public interface for the Realm backend. It takes a mapped PCG and lowers it through the dynamic graph to get the fully-specified execution order of tasks to be executed. Besides the usual dynamic graph passes (pass expansion, update insertion, shard expansion), this class also tracks the allocation of Realm instances for tensors.
+- \ref "FlexFlow::RealmManager": manages the initialization and shutdown of the Realm runtime. Provides the interface to launch the controller that runs the rest of the computation.
+- \ref "FlexFlow::RealmContext": an interface that wraps the rest of Realm and protects against certain classes of bugs, such as shutdown bugs. **Do NOT call Realm directly unless you know what you are doing.**
+- @ref "include/realm-execution/tasks": the Realm task implementations and their supporting infrastructure.
+  - @ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
+  - @ref "lib/realm-execution/include/realm-execution/tasks/serializer/" "serializer/": additional support for serializing Realm data types.
+  - @ref realm_task_registry.h: manages the registration of %Realm tasks. All %Realm tasks go through this interface.
+  - @ref task_id_t.h and @ref realm_task_id_t.h: types to represent %Realm tasks, along with an encoding to %Realm's native task ID type.
+
+\section realm-execution-other-components Other components used mainly within \ref FlexFlow::PCGInstance
+
+ - @ref "::FlexFlow::DistributedDeviceHandle": represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
+ - @ref "::FlexFlow::DependencySet": tracks dependencies during execution of tasks.
+ - @ref "distributed_device_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting FlexFlow::PerDeviceOpStateBacking.
+ - @ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting FlexFlow::TensorInstanceBacking.
+
+\section realm-execution-todo TODO
+
+- external instances
+- copies
+- task fusion
+- parallel operator implementation (partition, reduce, gather, etc.)
+- and fused parallel operators (reduce + broadcast = allreduce)
+- memory-optimizing compiler integration (tensor creation/destruction, tensor reuse)
+- control replication
+- Realm subgraphs
+
+*/
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
index b806aa1277..87460cc5a7 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
@@ -61,19 +61,20 @@ Realm::Event spawn_device_handle_init_task(
     bool allowTensorOpMathConversion,
     DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
     Realm::Event precondition) {
-  DeviceHandleInitTaskArgs task_args{
+
+  DeviceHandleInitTaskArgs task_args = DeviceHandleInitTaskArgs{
       workSpaceSize,
       allowTensorOpMathConversion,
       ctx.get_current_processor(),
       result_ptr,
   };
 
-  std::string args = serialize_task_args(
+  std::string serialized_args = serialize_task_args(
       device_handle_init_task_args_to_serializable(task_args));
   return ctx.spawn_task(target_proc,
                         task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
-                        args.data(),
-                        args.size(),
+                        serialized_args.data(),
+                        serialized_args.size(),
                         Realm::ProfilingRequestSet{},
                         precondition);
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index c7dcdb39c2..e86574c9b9 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -72,20 +72,25 @@ Realm::Event spawn_op_task(
     FFIterationConfig const &iteration_config,
     std::optional<OptimizerAttrs> const &optimizer_attrs,
     Realm::Event precondition) {
-  OpTaskArgs task_args{invocation,
-                       tensor_backing,
-                       device_state,
-                       profiling_settings,
-                       device_handle,
-                       iteration_config,
-                       optimizer_attrs};
-  std::string args =
+
+  OpTaskArgs task_args = OpTaskArgs{
+    invocation,
+    tensor_backing,
+    device_state,
+    profiling_settings,
+    device_handle,
+    iteration_config,
+    optimizer_attrs,
+  };
+
+  std::string serialized_args =
       serialize_task_args(op_task_args_to_serializable(task_args));
+
   return ctx.spawn_task(
       target_proc,
       assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)),
-      args.data(),
-      args.size(),
+      serialized_args.data(),
+      serialized_args.size(),
       Realm::ProfilingRequestSet{},
       precondition);
 }
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 09d99655c0..6cc4ff4d02 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -20,7 +20,7 @@ Realm::Event register_task(Realm::Processor::Kind target_kind,
   Realm::Processor::TaskFuncID realm_task_id =
       get_realm_task_id_for_task_id(func_id);
 #ifdef FF_USE_PREALM
-  Realm::prealm_task_name(realm_task_id, fmt::format("{}", func_id));
+  Realm::prealm_task_name(realm_task_id, fmt::to_string(func_id));
 #endif
   return Realm::Processor::register_task_by_kind(
       target_kind,
diff --git a/lib/runtime/src/machine_model.cc b/lib/runtime/src/machine_model.cc
index 73060e9b1a..ba16da9561 100644
--- a/lib/runtime/src/machine_model.cc
+++ b/lib/runtime/src/machine_model.cc
@@ -7,7 +7,7 @@
 namespace FlexFlow {
 
 /// @param[in] nb_elements : size of your for loop
-/// @param[in] functor(start, end) :
+/// @param[in] functor :
 /// your function processing a sub chunk of the for loop.
 /// "start" is the first index to process (included) until the index "end"
 /// (excluded)
diff --git a/lib/runtime/src/task_spec/README.md b/lib/runtime/src/task_spec/README.md
deleted file mode 100644
index 0884e62b4d..0000000000
--- a/lib/runtime/src/task_spec/README.md
+++ /dev/null
@@ -1,176 +0,0 @@
-# task\_spec
-
-The `task_spec` interface provides an easy-to-use, high-level, and safe abstraction on top of Legion tasks.
-While not all Legion features are supported, the `task_spec` interface is capable of expressing all Legion usages in FlexFlow.
-Using `task_spec` is not mandatory (Legion still works fine, as everything simply compiles down to Legion `TaskLauncher`, etc. 
-anyway), but any code that can use `task_spec` is strongly advised to use it as it is significantly less verbose, safer, and 
-prevents common errors.
-
-The `task_spec` code consists of two parts: `TaskSignature` ([task\_signature.h](./task_signature.h)) and `TaskInvocation` ([task\_invocation.h](./task_invocation.h)), 
-which can be intuitively understood as function signatures and function calls in a typical programming language.
-`TaskSignature`s define a set of _slots_ of two kinds: 
-each can be either a _tensor slot_, which represents a parallel tensor whose Legion region will be passed to the underlying task, 
-or an _argument slot_, which can be used to pass small[^1] values of arbitrary[^2] type via `Legion::TaskArgument`.
-
-As with function signatures/calls, each task has a single `TaskSignature` but can have multiple `TaskInvocation`s.
-`TaskSignature`s are registered for `task_id_t`s via the `register_task` function, which is usually called by specializations of `template <task_id_t> register_task` 
-defined in the relevant file (e.g., [optimizer.h](../optimizer.h) and [optimizer.cc](../optimizer.cc)), which are ultimately called by 
-`register_flexflow_internal_tasks` in [tasks.cc](../tasks.cc).
-
-To execute a pair of a `TaskSignature` and a `TaskInvocation`, they must be compiled/translated/lowered to a call to a `Legion::TaskLauncher` or a 
-`Legion::IndexTaskLauncher`.
-Ideally this would simply be done in a single step, but in practice the ability to specify `TaskInvocation`s at different layers of abstraction can 
-be very useful.
-Thus, what we previously referred to as `TaskInvocation` is actually logically the following set of classes:
-
-```mermaid
-flowchart TD
-    A[OpTaskInvocation]
-    B[TaskInvocation]
-    C[ExecutableTaskInvocation]
-    D[TensorlessTaskInvocation]
-    E[IndexTaskInvocation]
-    F[Legion::TaskLauncher]
-    G[Legion::IndexTaskLauncher]
-    H[ExecutableIndexTaskInvocation]
-    I[TensorlessIndexTaskInvocation]
-    A -->|compiles down to| E
-    E -->|compiles down to| H
-    H -->|compiles down to| I 
-    I -->|compiles down to| G
-
-    B -->|compiles down to| C
-    C -->|compiles down to| D
-    D -->|compiles down to| F
-```
-Similarly, `TaskSignature` is actually divided up into `OpTaskSignature` and `TaskSignature`.
-The flow of full compilation process is as follows:
-```mermaid
-%%{init: { 'themeVariables': { 'fontFamily': 'monospace' }, 'flowchart': { 'curve': 'bumpY', 'defaultRenderer': 'elk' }, 'theme': 'default' } }%%
-flowchart TD
-    A[OpTaskInvocation]
-    B[TaskInvocation]
-    C[ExecutableTaskInvocation]
-    D[TensorlessTaskInvocation]
-    E[IndexTaskInvocation]
-    F[Legion::TaskLauncher]
-    G[Legion::IndexTaskLauncher]
-    H[ExecutableIndexTaskInvocation]
-    I[TensorlessIndexTaskInvocation]
-    J[OpTaskSignature]
-    K[TaskSignature]
-    L[ConcreteArgsFormat]
-    M[FutureArgsFormat]
-    N[TensorArgsFormat]
-    O[IndexArgsFormat]
-    P[TaskArgumentsFormat]
-    Q[Legion::TaskArgument]
-    R[Legion::ArgumentMap]
-    S[TaskReturnAccessor]
-    T[IndexTaskReturnAccessor]
-    AA[task_id_t]
-    AC[TensorlessTaskBinding]
-    AD[TensorlessIndexTaskBinding]
-    AE[task_impl function]
-    AF[task function]
-    AG[Legion::Task]
-    AH["std::vector<Legion::PhysicalRegion>"]
-    AI[Legion::Context]
-    AJ[Legion::Runtime]
-    AK[TaskArgumentAccessor]
-    AL[add_region_requirement]
-
-    A -->|compiles to| E
-    E -->|compiles to| H
-    H -->|compiles to| N
-    N -->|compiles to| P
-    N -->|invokes| AL
-    AL -->|on| G
-    H -->|compiles to| I
-    I -->|has member| AA
-    I -->|has member| AD 
-    AD -->|compiles to| M
-    AD -->|compiles to| O
-    AD -->|compiles to| L
-    O -->|compiles to| R
-    O -->|compiles to| P
-    M -->|compiles to| P
-    L -->|compiles to| P
-    M -->|compiles to| Q
-    O -->|compiles to| Q
-    L -->|compiles to| Q
-    P -->|compiles to| Q
-    Q -->|passed to| G
-    R -->|passed to| G
-    G -->|generates a| AG
-    G -->|generates a| AH
-    G -->|generates a| AI 
-    G -->|generates a| AJ
-    AG -->|passed to| AF
-    AH -->|passed to| AF
-    AI -->|passed to| AF 
-    AJ -->|passed to| AF
-    AF -->|generates a| AK
-    AK -->|passed to| AE
-    AE -->|possibly generates a| S
-    G -->|possibly generates a| S
-    K -->|possibly generates a| S
-
-    B -->|compiles to| C
-    C -->|compiles to| N
-    C -->|compiles to| D
-    D -->|has member| AA
-    D -->|has member| AC
-    AC -->|compiles to| L 
-    AC -->|compiles to| M
-    L -->|compiles to| P
-    M -->|compiles to| P 
-    L -->|compiles to| Q
-    M -->|compiles to| Q
-    P -->|compiles to| Q
-    Q -->|passed to| F
-    AL -->|on| F
-    F -->|generates a| AG
-    F -->|generates a| AH
-    F -->|generates a| AI 
-    F -->|generates a| AJ
-    AE -->|possibly generates a| T
-    G -->|possibly generates a| T
-    K -->|possibly generates a| T
-
-    J -->|compiles to| K
-```
-
-The primary difference between the different `TaskInvocation` types is which argument types they support.
-The full list of argument types is:
-- tensor slots
-  - `OpTensorSpec`: a reference to a input, output, or weight tensor attched to the given operator. 
-  - `ParallelTensorSpec`: a reference (via `parallel_tensor_guid_t`) to a parallel tensor somewhere in the PCG.
-- argument slots
-  - `OpArgRefSpec`: an argument that should be filled in during the compilation process from `OpTaskInvocation` to `TaskInvocation`. For those familiar with `Reader` monads, this is roughly analogous
-  - `ConcreteArgSpec`: a concrete value
-  - `IndexArgSpec`: a set of concrete values, each of which should be sent to a different Index Task
-  - `CheckedTypedFuture`: a legion future whose value should be passed into the task
-  - `CheckedTypedFutureMap`: a set of legion futures, each of which should have its value sent to a different Index Task (conceptually, `IndexArgSpec` + `CheckedTypedFuture`)
-  - `ArgRefSpec`: an argument that should be filled in during the compilation process from `TaskInvocation` to `ExecutableTaskInvocation`. For those familiar with `Reader` monads, this is roughly analogous
-  - `TaskInvocationSpec`: a nested task invocation which should be launched and have its resulting `Future` passed into the given task
-  - `IndexTaskInvocationSpec`: (currently not implemented, may or may not be necessary)
-
-The supported argument types for each invocation type are:
-- `OpTaskInvocation`
-  - `OpTensorSpec`, `OpArgRefSpec`, `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `ArgRefSpec`, `TaskInvocationSpec`, `IndexTaskInvocationSpec`
-- `TaskInvocation`
-  - `ParallelTensorSpec`, `ConcreteArgSpec`, `CheckedTypedFuture`, `ArgRefSpec`, `TaskInvocationSpec`
-- `IndexTaskInvocation`
-  - `ParallelTensorSpec`, `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `ArgRefSpec`, `TaskInvocationSpec`, `IndexTaskInvocationSpec`
-- `ExecutableTaskInvocation`
-  - `ParallelTensorSpec`, `ConcreteArgSpec`, `CheckedTypedFuture`, `TaskInvocationSpec`
-- `ExecutableIndexTaskInvocation`
-  - `ParallelTensorSpec`, `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `TaskInvocationSpec`, `IndexTaskInvocationSpec`
-- `TensorlessTaskInvocation`
-  - `ConcreteArgSpec`, `CheckedTypedFuture`, `TaskInvocationSpec`
-- `TensorlessIndexTaskInvocation`
-  - `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `TaskInvocationSpec`, `IndexTaskInvocationSpec`
-
-[^1]: i.e., not tensor-sized
-[^2]: Types must either be serializable ([serialization.h](../serialization.h)) or device-specific ([device\_specific\_arg.h](./device-specific-arg.h))
diff --git a/lib/substitutions/README.md b/lib/substitutions/README.md
deleted file mode 100644
index e9db4c6aab..0000000000
--- a/lib/substitutions/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# substitutions
-
-## Substitution
-
-A substitution is to replace a subgraph of the PCG by a new one. We refer to the subgraph to be replaced as the input graph, and the new subgraph to replace the input graph as the output graph.
-
-A `Substitution` object describes a substitution. It consists of
-* An `input_graph` of type `GraphPattern` that describes which kind of input graphs the substitution can be applied to;
-* An `output_graph` of type `OutputGraphExpr` that describes how the output graph is computed from the input graph; and
-* An `input_mapping` and `output_maping` that describes how the output graph is connected to the original PCG.
-
-### GraphPattern and MultiDiGraphPatternMatch
-
-A `GraphPattern` is defined as an open graph with node label `OperatorPattern` and output label `ParallelTensorPattern`, which is refered to as the pattern graph. The graph structure of a `GraphPattern` instance defines the geometrical property of the input graph, while the node labels and output labels define the attribute property of that.
-
-To apply a substitution to a PCG, we should first match the pattern graph to a subgraph of the PCG. `MultiDiGraphPatternMatch` describes the match, which consists of
-* `node_assignment`: a mapping from the nodes of the pattern graph to the nodes of the PCG; and
-* `edge_assignment`: a mapping from the edges of the pattern graph to the nodes of the PCG.
-The input graph derived by this match is then defined by `values(node_assignment)` and `values(edge_assignment)`. A match is valid if and only if
-* `node_assignment` and `edge_assignment` are injections;
-* For every node `n` in the pattern graph, `edge_assignment` derives a bijection between `query_edges({n})` and `query_edges({node_assignment.at_l(n)})`.
-
-### OutputGraphExpr
-
-An `OutputGraphExpr` is defined as an open graph with node label `OperatorAttrAssignment` and output label `ParallelTensorAttrAssignment`, which defines how the operator attributes and the parallel tensor attributes of the output graph are derived from the input graph.
-
-`OperatorAttrAssignment` is a collection of `OperatorAttributeKey` and `GraphAttributeExpr` pairs. It defines how the attributes of a single operator is calculated from the input graph. A pair `{operator_attribute_key, graph_attribute_expr}` in the collection means the value of `graph_attribute_expr` is assigned to the attribute named `operator_attribute_key` of the operator.
-
-`ParallelTensorAttrAssignment` is defined in the similar way to `OperatorAttrAssignment`.
-
-`GraphAttributeExpr` is defined as one of `NodeAttrAccess`, `EdgeAttrAccess` and `AttrConstant`:
-* `NodeAttrAccess` consists of a node `node` and an expression `attr_expr` on the attributes of the operator associated with the node. The value of a `NodeAttrAccess` instance is the value of `attr_expr` evaluated on the operator associated with the node.
-* `EdgeAttrAccess` is defined in the similar way to `NodeAttrAccess`.
-* `AttrConstant` consists of a constant `value`. The value of an `AttrConstant` instance is `value`.
diff --git a/lib/substitutions/include/substitutions/substitution.dtg.toml b/lib/substitutions/include/substitutions/substitution.dtg.toml
index 5daeaceded..bd98efc71b 100644
--- a/lib/substitutions/include/substitutions/substitution.dtg.toml
+++ b/lib/substitutions/include/substitutions/substitution.dtg.toml
@@ -15,15 +15,27 @@ includes = [
 [[fields]]
 name = "pcg_pattern"
 type = "::FlexFlow::PCGPattern"
+docstring = """
+Describes which kind of input graphs the substitution can be applied to
+"""
 
 [[fields]]
 name = "output_graph_expr"
 type = "::FlexFlow::OutputGraphExpr"
+docstring = """
+Describes how the output graph is computed from the input graph
+"""
 
 [[fields]]
 name = "inputs_mapping"
 type = "::FlexFlow::bidict<::FlexFlow::PatternInput, ::FlexFlow::OutputGraphExprInput>"
+docstring = """
+Describes how the values matched by the pattern's inputs are connected to the original ParallelComputationGraph
+"""
 
 [[fields]]
 name = "outputs_mapping"
 type = "::FlexFlow::bidict<::FlexFlow::PatternNodeOutput, ::FlexFlow::OutputGraphExprNodeOutput>"
+docstring = """
+Describes how the values matched by the pattern's outputs are connected to the original ParallelComputationGraph
+"""
diff --git a/lib/substitutions/index.dox b/lib/substitutions/index.dox
new file mode 100644
index 0000000000..2c65c7362f
--- /dev/null
+++ b/lib/substitutions/index.dox
@@ -0,0 +1,26 @@
+/**
+
+\page substitutions substitutions
+
+\section substitution Substitution
+
+A \ref ::FlexFlow::Substitution replaces a subgraph of the PCG with a new one. We refer to the subgraph to be replaced as the input graph, and the new subgraph that replaces the input graph as the output graph.
+
+\section pattern-matches PCGPattern and MultiDiGraphPatternMatch
+
+A \ref ::FlexFlow::PCGPattern is defined as an open graph with node label ::FlexFlow::OperatorPattern and output label `ParallelTensorPattern`, which is referred to as the pattern graph. The graph structure of a `PCGPattern` instance defines the geometrical property of the input graph, while the node labels and output labels define the attribute property of that.
+
+To apply a substitution to a PCG, we should first match the pattern graph to a subgraph of the PCG. `MultiDiGraphPatternMatch` describes the match, which consists of
+* `node_assignment`: a mapping from the nodes of the pattern graph to the nodes of the PCG; and
+* `edge_assignment`: a mapping from the edges of the pattern graph to the edges of the PCG.
+The input graph derived by this match is then defined by `values(node_assignment)` and `values(edge_assignment)`. A match is valid if and only if
+* `node_assignment` and `edge_assignment` are injections;
+* For every node `n` in the pattern graph, `edge_assignment` derives a bijection between `query_edges({n})` and `query_edges({node_assignment.at_l(n)})`.
+
+\section output-graph-expr OutputGraphExpr
+
+An \ref ::FlexFlow::OutputGraphExpr is defined as an open graph with node label \ref ::FlexFlow::OutputOperatorAttrAssignment and output label \ref std::monostate.
+
+\ref ::FlexFlow::OutputOperatorAttrAssignment is a collection of \ref ::FlexFlow::OperatorAttributeKey and \ref ::FlexFlow::OutputOperatorAttributeExpr pairs. It defines how the attributes of a single operator are calculated from the input graph. A pair `{operator_attribute_key, output_operator_attribute_expr}` in the collection means the value of `output_operator_attribute_expr` is assigned to the attribute named `operator_attribute_key` of the operator.
+
+*/
diff --git a/lib/task-spec/index.dox b/lib/task-spec/index.dox
new file mode 100644
index 0000000000..bad8e8d5e1
--- /dev/null
+++ b/lib/task-spec/index.dox
@@ -0,0 +1,5 @@
+/**
+
+@page task-spec task-spec
+
+*/
diff --git a/lib/utils/README.md b/lib/utils/README.md
deleted file mode 100644
index a9c1ad3e88..0000000000
--- a/lib/utils/README.md
+++ /dev/null
@@ -1,449 +0,0 @@
-# utils
-
-## visitable
-
-[!WARNING]
-`visitable` is deprecated, new code should instead use `dtgen`
-
-### Motivation
-
-FlexFlow's codebase makes heavy use of "plain old data"[^2] types[^1] (referred to as _product types_ in the rest of this document) such as the following:
-```cpp
-struct Person {
-  std::string first_name;
-  std::string last_name;
-  int age;
-};
-```
-However, this standard implementation defines a set of behaviors that we, the FlexFlow developers, find undesirable:
-
-1. Partial constructibility: for many product types partial constructibility can make code bug-prone. For example, let us consider the following valid code:
-```cpp
-struct Person {
-  Person() = delete;
-
-  std::string first_name;
-  std::string last_name;
-  int age = 0;
-};
-
-Person p{"donald", "knuth"};
-```
-This code will compile just fine, but will silently use a nonsensical value of `age`. 
-Even worse, let us imagine that in the someone else adds an additional field `is_male`. 
-Unless they find and update every place in which `Person` is constructed, they will be left with the following code, which 
-compiles without errors but is (as of writing this) incorrect.
-```cpp
-struct Person {
-  Person() = delete;
-
-  std::string first_name;
-  std::string last_name;
-  int age = 0;
-  bool is_male = false;
-};
-
-Person p{"donald", "knuth", 85};
-```
-
-Not only can single fields be undefined/invalid, but whole structs can silently be filled with incorrect values if default constructibility is enabled:
-```cpp
-Person some_function() {
-  Person p;
-  if (...) {
-    p = {"donald", "knuth", 85};
-  }
-  return p;
-}
-```
-If the `if` branch is not taken, we will return a `Person` with nonsensical values, as there do not exist any values that naturally form a default.
-We could initalize the values as follows
-```cpp
-struct Person {
-  std::string first_name; // initializes to ""
-  std::string last_name; // initializes to ""
-  int age = 0;
-}
-```
-but this is a completely useless value, and if it shows up anywhere in our code it's probably a bug, since a nameless, age 0 person is probably not a helpful value to have.
-
-3. For product types, `operator==` and `operator!=` are trivial, but still have to be written and maintained, and can easily lead to bugs. For example, 
-```
-struct Person {
-  Person() = delete;
-  Person(std::string const &first_name, 
-         std::string const &last_name, 
-         int age)
-    : first_name(first_name),
-      last_name(last_name),
-      age(age),
-    { }
-
-  friend bool operator==(Person const &lhs, Person const &rhs) {
-    return lhs.first_name == rhs.first_name 
-      && lhs.last_name == rhs.last_name
-      && lhs.age == rhs.age;
-  }
-
-  friend bool operator!=(Person const &lhs, Person const &rhs) {
-    return lhs.first_name != rhs.first_name 
-      || lhs.last_name != rhs.last_name
-      || lhs.age != rhs.age;
-  }
-
-  std::string first_name;
-  std::string last_name;
-  int age;
-};
-```
-If we take the previous example of adding an additional `is_male` field to `Person`, it can be easy to miss a location, leading to incorrectness. 
-For example, we could quite easily end up with
-```cpp
-struct Person {
-  Person() = delete;
-  Person(std::string const &first_name, 
-         std::string const &last_name, 
-         int age,
-         bool is_male)
-    : first_name(first_name),
-      last_name(last_name),
-      age(age),
-      is_male(is_male)
-    { }
-
-  friend bool operator==(Person const &lhs, Person const &rhs) {
-    return lhs.first_name == rhs.first_name 
-      && lhs.last_name == rhs.last_name
-      && lhs.age == rhs.age
-      && lhs.is_male == rhs.is_male;
-  }
-
-  friend bool operator!=(Person const &lhs, Person const &rhs) {
-    return lhs.first_name != rhs.first_name 
-      || lhs.last_name != rhs.last_name
-      || lhs.age != rhs.age;
-      // oops, forgot to update with the new is_male field. Have fun debugging :P
-  }
-
-  std::string first_name;
-  std::string last_name;
-  int age;
-  bool is_male;
-};
-```
-and for product types with more fields this grows increasingly tedious to write and maintain. 
-
-4. Hashing: hashing product types is relatively trivial, as long as each of the fields is hashable. But again, we have to do a bunch of extra work to specify this, and this work has to be done for each product type in the codebase.
-(**Note:** from here on the examples are growing to grow increasingly long to emphasize the amount of code that needs to be written. Feel free to skim these longer code snippets if you trust the statement that implementing product types in vanilla C++ is tedious)
-```cpp
-struct Person {
-  Person() = delete;
-  Person(std::string const &first_name, 
-         std::string const &last_name, 
-         int age,
-         bool is_male)
-    : first_name(first_name),
-      last_name(last_name),
-      age(age),
-      is_male(is_male)
-    { }
-
-  friend bool operator==(Person const &lhs, Person const &rhs) {
-    return lhs.first_name == rhs.first_name 
-      && lhs.last_name == rhs.last_name
-      && lhs.age == rhs.age
-      && lhs.is_male == rhs.is_male;
-  }
-
-  friend bool operator!=(Person const &lhs, Person const &rhs) {
-    return lhs.first_name != rhs.first_name 
-      || lhs.last_name != rhs.last_name
-      || lhs.age != rhs.age
-      || lhs.is_male != rhs.is_male;
-  }
-
-  std::string first_name;
-  std::string last_name;
-  int age;
-  bool is_male;
-};
-
-// BEGIN new code
-namespace std {
-
-template <>
-struct hash<::Person> {
-  size_t operator()(::Person const &p) const {
-    size_t result = 0;
-    hash_combine(result, p.first_name);
-    hash_combine(result, p.last_name);
-    hash_combine(result, p.age);
-    hash_combine(result, p.is_male);
-  }
-};
-// END new code
-
-}
-```
-and if we also want to support `std::set<Person>` (which requires `Person` to be ordered), we also have to add `operator<`
-```cpp
-struct Person {
-  Person() = delete;
-  Person(std::string const &first_name, 
-         std::string const &last_name, 
-         int age,
-         bool is_male)
-    : first_name(first_name),
-      last_name(last_name),
-      age(age),
-      is_male(is_male)
-    { }
-
-  friend bool operator==(Person const &lhs, Person const &rhs) {
-    return lhs.first_name == rhs.first_name 
-      && lhs.last_name == rhs.last_name
-      && lhs.age == rhs.age
-      && lhs.is_male == rhs.is_male;
-  }
-
-  friend bool operator!=(Person const &lhs, Person const &rhs) {
-    return lhs.first_name != rhs.first_name 
-      || lhs.last_name != rhs.last_name
-      || lhs.age != rhs.age
-      || lhs.is_male != rhs.is_male;
-  }
-
-// BEGIN new code
-  friend bool operator<(Person const &lhs, Person const &rhs) {
-    return lhs.first_name < rhs.first_name 
-      || lhs.last_name < rhs.last_name
-      || lhs.age < rhs.age
-      || lhs.is_male < rhs.is_male;
-  }
-// END new code
-
-  std::string first_name;
-  std::string last_name;
-  int age;
-  bool is_male;
-};
-
-namespace std {
-
-template <>
-struct hash<::Person> {
-  size_t operator()(::Person const &p) const {
-    size_t result = 0;
-    hash_combine(result, p.first_name);
-    hash_combine(result, p.last_name);
-    hash_combine(result, p.age);
-    hash_combine(result, p.is_male);
-  }
-};
-
-}
-```
-Even for such a simple datatype, we have a significant amount of code that must be written and maintained.
-FlexFlow's codebase contains tens if not hundreds of these product types, and so the approach above is infeasible.
-
-[^1]: aka product types, aka Haskell's `data`. Essentially types that are just a tuple of fields with names.
-[^2]: by "plain old data" we refer to the general idea behind [C++'s POD](https://en.cppreference.com/w/cpp/named_req/PODType), but not its exact definition
-
-### Adding new visitable types
-
-FlexFlow's `visitable` support provides an easy way to express product types, and prevents any of the bugs listed above.
-To express the above definition of `Person` using `visitable`, we would write the following code:
-```cpp
-struct Person {
-  std::string first_name;
-  std::string last_name;
-  int age;
-  req<bool> is_male;
-};
-FF_VISITABLE_STRUCT(Person, first_name, last_name, age, is_male);
-```
-The key addition here is the calling the `FF_VISITABLE_STRUCT` macro. 
-In addition to defining all of the above functions, this macro also performs a series of compile-time checks (via `static_assert`) to check that the product type is implemented correctly (for example, it will check that the type is not default constructible[^3]).
-The only additional change is the addition of the `req` (which stands for `required`) wrapper on the last field. 
-Conceptually, `req` is simple: it removes default constructibility of the type it wraps (if the last field in the struct is already not default-constructible, no `req` is needed).
-Don't worry if you forget to add a `req`: `FF_VISITABLE_STRUCT` will check that your type properly disables default and partial construction (see [Macro Reference](#macro-reference)).
-Combined with [aggregate initialization](https://en.cppreference.com/w/cpp/language/aggregate_initialization), we are able to construct a `Person` as follows:
-```cpp
-Person p = { "donald", "knuth", 85, true };
-```
-and any subset of the fields would raise an error at compile time. Without any additional code, `Person` supports `operator==`, `operator!=`, `std::hash`, and `operator<`, as well as other more specific features (e.g., [JSON serialization](#json-serialization))
-
-[^3]: The full list of properties is detailed in [Macros Details](#macro-reference)
-
-### Limitations
-
-`visitable` types have two primary limitations. First, they do not support initialization with `(...)`:
-```cpp
-Person p{ "donald", "knuth", 85, true }; // CORRECT
-Person p2("robert", "tarjan", 75, true); // ERROR
-```
-Secondly, template types cannot be visitable (we hope to remove this limitation in the distant future), but instantiations of them can.
-```cpp
-template <typename T>
-struct MyLists {
-  std::vector<T> list1;
-  req<std::vector<T>> list2;
-};
-FF_VISITABLE_STRUCT(MyLists, list1, list2); // ERROR
-
-using MyInts = MyLists<int>;
-
-FF_VISITABLE_STRUCT(MyInts, list1, list2); // CORRECT
-```
-A smaller limitation is that `FF_VISITABLE_STRUCT` only works from within the `FlexFlow` namespace (this is not much of an issue as all of the `FlexFlow` code resides in a single namespace).
-
-### Advanced Features
-
-While `FF_VISITABLE_STRUCT` matches the behavior of many product types in FlexFlow's codebase, there are exceptions. Many of these resemble the code below:
-```cpp
-struct Cow { ... };
-
-struct TownPopulation {
-  std::vector<Person> people;
-  std::vector<Cow> cows;
-};
-```
-Unlike in the `Person` example, `TownPopulation` has an obvious default value: an empty town (i.e., both people and cow are empty).
-However, if we write
-```cpp
-FF_VISITABLE_STRUCT(TownPopulation, people, cows); // ERROR: TownPopulation should not be default constructible
-```
-we get the something approximating the error in the comment.
-If we were to abandon `visitable` entirely, we would have to write (**Note:** long code example to demonstrate how tedious this is, feel free to skim)
-```cpp
-struct Cow { ... };
-
-struct TownPopulation {
-  TownPopulation() = default;
-  TownPopulation(std::vector<Person> const &people,
-                 std::vector<Cow> const &cows)
-    : people(people), 
-      cows(cows)
-  { }
-
-  friend bool operator==(TownPopulation const &lhs, TownPopulation const &rhs) {
-    return lhs.people == rhs.people 
-      && lhs.cows == rhs.cows;
-  }
-
-  friend bool operator!=(TownPopulation const &lhs, TownPopulation const &rhs) {
-    return lhs.people != rhs.people
-      || lhs.cows != rhs.cows;
-  }
-
-  friend bool operator<(TownPopulation const &lhs, TownPopulation const &rhs) {
-    return lhs.people < rhs.people
-      || lhs.cows < rhs.cows;
-  }
-
-  std::vector<Person> people;
-  std::vector<Cow> cows;
-};
-
-namespace std {
-
-template <>
-struct hash<::TownPopulation> {
-  size_t operator()(::TownPopulation const &t) const {
-    size_t result = 0;
-    hash_combine(result, t.people);
-    hash_combine(result, t.cows);
-    return result;
-  }
-};
-
-}
-```
-which is tedious and bug-prone.
-To remove the constructibility checks performed by `FF_VISITABLE_STRUCT`, we simply use `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION` instead:
-```cpp
-struct TownPopulation {
-  TownPopulation() = default;
-  TownPopulation(std::vector<Person> const &people,
-                 std::vector<Cow> const &cows)
-    : people(people), 
-      cows(cows)
-  { }
-
-  std::vector<Person> people;
-  std::vector<Cow> cows;
-};
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TownPopulation, people, cows);
-```
-This is also useful for defining structs with specific non-standard constructor signatures. For example,
-```cpp
-struct TownPopulation {
-  TownPopulation() = default;
-
-  // constructs a TownPopulation filled with the given number of random people and cows
-  TownPopulation(int num_people,
-                 int num_cows)
-    : people(generate_random_people_of_size(num_people)),
-      cows(generate_random_cows_of_size(num_cows))
-  { }
-
-  TownPopulation(std::vector<Person> const &people,
-                 std::vector<Cow> const &cows)
-    : people(people), 
-      cows(cows)
-  { }
-
-  std::vector<Person> people;
-  std::vector<Cow> cows;
-};
-```
-
-#### JSON Serialization
-
-TODO
-
-### Macro Reference
-
-The properties that are checked by each macro are as follows:
-
-1. `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TYPENAME, ...fields...)`
-  - If `length(fields) > 0`:
-    - Every field in `TYPENAME` is `std::hash`able
-    - Every field in `TYPENAME` is listed under `fields`
-    - `TYPENAME` is copy constructible
-    - `TYPENAME` is move constructible
-    - `TYPENAME` is copy assignable
-    - `TYPENAME` is move assignable
-    - Every field in `TYPENAME` supports `operator==` 
-    - Every field in `TYPENAME` supports `operator!=`
-  - If `length(fields) == 0`:
-    - `TYPENAME` is copy constructible
-    - `TYPENAME` is move constructible
-    - `TYPENAME` is copy assignable
-    - `TYPENAME` is move assignable
-
-2. `FF_VISITABLE_STRUCT(TYPENAME, ...fields...)` (in addition to the checks in `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION`)
-  - If `length(fields) > 0`:
-    - `TYPENAME` is only constructible when all fields are passed in[^4] 
-  - If `length(fields) == 0`:
-    - `TYPENAME` is default constructible
-
-[^4]: This is usually resolved by either wrapping the last field in a `req` or using `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION`
-
-### Internals
-
-TODO
-
-## stack_vector, stack_string, stack_map
-
-## strong_typedef
-
-## containers
-
-## graph
-
-## bidict
-
-## type_traits
-
-## test_types
diff --git a/lib/utils/include/utils/cli/index.dox b/lib/utils/include/utils/cli/index.dox
new file mode 100644
index 0000000000..a89dca5364
--- /dev/null
+++ b/lib/utils/include/utils/cli/index.dox
@@ -0,0 +1,9 @@
+/**
+
+\page utils-cli utils/cli
+
+A basic CLI library for use by programs in \ref bin.
+For an example of how to use it, see the following snippet from \ref export-model-arch.
+
+\snippet bin/export-model-arch/src/export-model-arch/main.cc utils/cli example
+*/
diff --git a/lib/utils/include/utils/containers/index.dox b/lib/utils/include/utils/containers/index.dox
new file mode 100644
index 0000000000..38d050d67a
--- /dev/null
+++ b/lib/utils/include/utils/containers/index.dox
@@ -0,0 +1,12 @@
+/**
+
+\page utils-containers utils/containers
+
+A bunch of generic functions for transforming various C++ standard library containers.
+These should generally be preferred over raw for loops in %FlexFlow code.
+Some of the most commonly-used functions are listed below, but you should ideally slowly work to familiarize yourself with everything in this directory.
+
+- \ref containers/transform.h
+- \ref containers/filter.h
+
+*/
diff --git a/lib/utils/include/utils/graph/README.md b/lib/utils/include/utils/graph/README.md
index 5cf0c88015..78fdb4b6a1 100644
--- a/lib/utils/include/utils/graph/README.md
+++ b/lib/utils/include/utils/graph/README.md
@@ -2,7 +2,7 @@
 
 ## Design Considerations
 
-FlexFlow's graph library very intentionally attempts to balance performance and ease of use. 
+FlexFlow's graph library very intentionally attempts to balance performance and ease of use.
 The graph library aims to have a very simple external interface that is highly decoupled from the underlying representations, so performance and internal implementations can be tuned and modified over time without breaking the code that uses the library.
 Because FlexFlow's graphs are not on the scale of machine memory or not so large that single traversals takes nontrivial time, the graph library intentionally avoids performance opportunities that would expose many of these performance aspects to user code.
 Of course, there are also some optimizations that simply have not been done due to time constraints: for example, algorithms currently are able to be specialized for the underlying representation being used, but this could be added without modifying the user-side interface.
@@ -12,7 +12,7 @@ Of course, there are also some optimizations that simply have not been done due
 ### Core Graph Variants
 
 There is no single type of graph. Should it be directed? Allow multiple edges between nodes? Should nodes and/or edges have information attached?
-Because there is no single answer to this question, similar to [networkx](https://networkx.org/) we provide a number of different graph variants. 
+Because there is no single answer to this question, similar to [networkx](https://networkx.org/) we provide a number of different graph variants.
 At their core, they are as follows:
 
 - `UndirectedGraph`: at most one edge allowed between every pair of nodes, edges are undirected.
@@ -30,7 +30,7 @@ flowchart TD
     C(" ")
     D(" ")
     E(" ")
-    
+
     A --- B
     A --- C
     B --- C
@@ -83,9 +83,9 @@ This is the case with all of the 4 core graph classes.
 Nodes are of type `Node`, and from a user perspective are simply opaque handles, and source and destination indices should similarly be considered opaque from a user point of view.
 In addition, nodes should only be used in the context of their graph, so comparing or checking equality of nodes between different graphs (even of the same type) is undefined behavior[^1].
 
-All three core graph variants allow insertion and deletion of both edges and nodes. 
+All three core graph variants allow insertion and deletion of both edges and nodes.
 To add a node to an `UndirectedGraph g`, simply call `g.add_node()`, which will return a `Node` object.
-For semantics closer to `networkx`'s method of adding nodes, `g.add_node_unsafe(my_node)` can be used. This is useful when constructing a modified copy of an existing graph (given that it maintains node bijection), though it is not generally recommended. 
+For semantics closer to `networkx`'s method of adding nodes, `g.add_node_unsafe(my_node)` can be used. This is useful when constructing a modified copy of an existing graph (given that it maintains node bijection), though it is not generally recommended.
 The interface for node addition is identical for `DiGraph` and `MultiDiGraph`.
 To add an edge between two nodes `Node n1` and `Node n2` to an `UndirectedGraph g`, call `g.add_edge({n1, n2})`.
 In `UndirectedGraph` the order of the arguments of `add_edge` doesn't matter as edges are undirected, but the order does matter for `DiGraph`, `MultiDiGraph` and `DataflowGraph`.
@@ -93,7 +93,7 @@ In `UndirectedGraph` the order of the arguments of `add_edge` doesn't matter as
 The last paragraph covered the base API used to write to graphs, but we also want to be able to read from graphs.
 Reading from graphs is implemented with the `query_nodes` and `query_edges` methods, which can be thought of as executing a database query over the nodes and edges of the target graph, respectively (where queries are restricted to an incredibly simple set of operations).
 The argument to `query_nodes` is a `NodeQuery` (which is simply a set of `Node`s).
-`query_nodes` then returns the intersection of the nodes in the graph and the nodes in the query. 
+`query_nodes` then returns the intersection of the nodes in the graph and the nodes in the query.
 The set of nodes in the query is actually an `optional`, so `nullopt` could also be passed, which would simply retrieve all nodes from the target graph (essentially `nullopt` acts as the set of all nodes that could ever exist).
 `query_edges` functions similarly, but as with `add_edge` its behavior is differs slightly between the three graph variants.
 `UndirectedGraph::query_edges` simply takes an optional set of nodes and returns all edges that touch any of those nodes.
@@ -103,11 +103,11 @@ In practice you will rarely ever use `query_nodes` and `query_edges` as the grap
 The layer users will most commonly interact with is the interface provided within either the `algorithms.h` header files or the `algorithms` folders, present in their respective graph class folders.
 They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as `get_nodes` to as complex as `get_transitive_reduction` and `get_dominators`.
 Note that, due to the internal virtual inheritance structure, some functions for more privitive classes can be employed by the derived classes. (For example, `get_nodes` present in `node/algorithms.h` can be used by `DiGraph`).
-You may notice that the most of algorithms present take as arguments not `UndirectedGraph`, `DiGraph`, and `MultiDiGraph`, but rather `UndirectedGraphView`, `DiGraphView`, and `MultiDiGraphView`. 
+You may notice that most of the algorithms present take as arguments not `UndirectedGraph`, `DiGraph`, and `MultiDiGraph`, but rather `UndirectedGraphView`, `DiGraphView`, and `MultiDiGraphView`.
 These `GraphView` objects represent read-only (i.e., immutable) graphs.
 Similar to C++'s `const` semantics, `Graph`s can be coerced[^2] to `GraphView`s but not the other way around.
 To transform a `GraphView` to a `Graph`, we can perform an explicit copy with `materialize_view`.
-Both `Graph` and `GraphView` types follow normal value semantics. 
+Both `Graph` and `GraphView` types follow normal value semantics.
 This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the `Graph` and `GraphView` types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a `DiGraph` use for example `get_subgraph` to get a `DiGraphView` representing a part of this graph, modifications to the underlying `DiGraph` will not be mirrored in the `DiGraphView` and the `DiGraphView` will remain valid even after the base `DiGraph` leaves scope.
 
 At this point, however, we still have not discussed how to create a graph.
@@ -128,11 +128,11 @@ At a high level, nodes represent multivariate functions (from tuples of inputs t
 
 `DataflowGraph` is similar to `MultiDiGraph`, but with the following important differences:
   - The edges entering, exiting a given nodes have a well-defined order.
-  - The outputs of a given node also have a well-defined order. 
+  - The outputs of a given node also have a well-defined order.
   - `DataflowGraph`s are directed acyclic graphs. This is enforced by the interface used to construct them, since a node can only be added to the graph after all of its predecessor nodes have already been added.
 
 The main components of `DataflowGraph` are as follows:
-- `DataflowInput`: used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator). 
+- `DataflowInput`: used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator).
 - `DataflowOutput`: used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator).
 - `DataflowEdge`: wrapper around a `DataflowInput`, `DataflowOutput` pair between 2 nodes.
 - `NodeAddedResult`: returned upon adding a new node. Contains the newly generated `Node` and the vector of `DataflowOutput`s for the given node.
@@ -141,7 +141,7 @@ The main components of `DataflowGraph` are as follows:
 
 ```cpp
     auto g = DataflowGraph::create<UnorderedSetDataflowGraph>();
-    
+
     // Node with no inputs and 2 outputs
     NodeAddedResult n1_result = g.add_node({}, 2);
     Node n1 = n1_result.node;
@@ -228,7 +228,7 @@ This graph class is particularly useful for processing a sub-graph of a given gr
 ### Labelled Dataflow Variant
 
 As nice as all of the above is, graphs without labels are mostly useless--in practice, nodes and edges represent some other system and the properties of that system (or at least a way to map the result of graph algorithms back to the underlying system) are necessary.
-Thus, FlexFlow's graph library provides the ability to add labels to `DataflowGraph`, through the `LabelleledDataflowGraph` and `OpenLabelleledDataflowGraph`, which allow users to label different components of the graph. 
+Thus, FlexFlow's graph library provides the ability to add labels to `DataflowGraph`, through the `LabelledDataflowGraph` and `OpenLabelledDataflowGraph`, which allow users to label different components of the graph.
 - `LabelledDataflowGraph` allows for labelling of `Node`s and `DataflowOutput`s.
 - `OpenLabelledDataflowGraph` allows for labelling of `Node`s and `OpenDataflowValue`s, which is a variant describing both `DataflowOutput`s and `DataflowGraphInput`s.
 
@@ -252,7 +252,7 @@ Most of the major graph classes in the library come in sets of 4. For a given cl
 General rules which apply to most classes:
 - `ClassName` (virtually) inherits from `ClassNameView`. Similarly, `IClassName` (virtually) inherits from `IClassNameView`.
 - `ClassName` has, as a member variable, a `cow_ptr` of type `IClassName`. Same holds for `ClassNameView`.
-Thus, the bulk of the inheritance that actually extends functionality is present among `IClassNameView` classes. 
+Thus, the bulk of the inheritance that actually extends functionality is present among `IClassNameView` classes.
 
 
 ### cow_ptr and Interfaces
@@ -274,4 +274,4 @@ All member functions present in `ClassName` and `ClassNameView` delegate their c
 ### Virtual Inheritance
 Due to the complexity of the graph library, diamond-style inheritance patterns emerge.
 In the case of a diamond inheritance pattern, C++ will instantiate multiple copies of the base class whenever we instantiate a derived class.
-To address this issue, we employ [Virtual Inheritance](https://en.wikipedia.org/wiki/Virtual_inheritance), which removes the ambiguity associated with the multiple copies.
+To address this issue, we employ <a href="https://en.wikipedia.org/wiki/Virtual_inheritance">virtual inheritance</a>, which removes the ambiguity associated with the multiple copies.
diff --git a/lib/utils/include/utils/graph/index.dox b/lib/utils/include/utils/graph/index.dox
new file mode 100644
index 0000000000..68a6d05fd1
--- /dev/null
+++ b/lib/utils/include/utils/graph/index.dox
@@ -0,0 +1,50 @@
+/**
+
+\page utils-graph utils/graph
+
+\note This documentation is somewhat out of date and, more importantly, \c utils/graph is in rather dire need of a reorganization, so take these docs with a grain of salt.
+
+\section design-considerations Design Considerations
+
+FlexFlow's graph library very intentionally attempts to balance performance and ease of use.
+The graph library aims to have a very simple external interface that is highly decoupled from the underlying representations, so performance and internal implementations can be tuned and modified over time without breaking the code that uses the library.
+Because FlexFlow's graphs are not on the scale of machine memory, nor so large that a single traversal takes nontrivial time, the graph library intentionally avoids performance opportunities that would expose many of these performance aspects to user code.
+Of course, there are also some optimizations that simply have not been done due to time constraints: for example, algorithms currently are not able to be specialized for the underlying representation being used, but this could be added without modifying the user-side interface.
+
+\section usage Usage
+
+\subsection core-graph-variants Core Graph Variants
+
+\subsection dataflow-graph DataflowGraph
+
+\subsection open-dataflow-variant Open Dataflow Variant
+
+\subsection labelled-dataflow-variant Labelled Dataflow Variant
+
+\section graph-internals Internals
+
+\subsection cow-ptr-and-interfaces cow_ptr and Interfaces
+
+The reason for the existence of the \c View variants has been explained in previous sections.
+The existence of the \c "I(nterface)" variants stems from C++'s approach to modeling polymorphism.
+
+C++ polymorphism is achieved at runtime through the use of <a href="https://www.learncpp.com/cpp-tutorial/virtual-functions/">virtual functions</a>, which allow for a single function defined on some superclass to also work correctly on its subclasses.
+
+To create objects with polymorphic behaviour, we use the following syntax:
+\code
+BaseClass* obj = new DerivedClass(); //or alternatives such as std::shared_ptr<BaseClass> obj = std::make_shared<DerivedClass>();
+\endcode
+Any call to one of \c obj 's member functions is resolved at runtime (dynamic binding), with C++ calling the most derived implementation of the function.
+
+While this pattern works nicely, the way instantiation is done leaves the burden of memory management on the user.
+To address this, graph classes store a \ref cow_ptr as a member variable, which points to an instance of their corresponding interface class.
+
+All member functions present in \c ClassName and \c ClassNameView delegate their calls to their corresponding interface classes (which implement the actual logic), meaning that these classes essentially act as wrappers to their interface counterparts.
+
+\subsection virtual-inheritance Virtual Inheritance
+
+Due to the complexity of the graph library, diamond-style inheritance patterns emerge.
+In the case of a diamond inheritance pattern, C++ will instantiate multiple copies of the base class whenever we instantiate a derived class.
+To address this issue, we employ <a href="https://en.wikipedia.org/wiki/Virtual_inheritance">virtual inheritance</a>, which removes the ambiguity associated with the multiple copies.
+
+*/
diff --git a/lib/utils/index.dox b/lib/utils/index.dox
new file mode 100644
index 0000000000..bff8eefb26
--- /dev/null
+++ b/lib/utils/index.dox
@@ -0,0 +1,9 @@
+/**
+
+@page utils utils
+
+- \subpage utils-containers
+- \subpage utils-cli
+- \subpage utils-graph
+
+*/

From 045a8857cb39af6f230359b42c307d98465b5519 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Wed, 4 Mar 2026 02:28:30 -0800
Subject: [PATCH 090/113] Namespace declarations in dox files and other
 incremental improvements

---
 bin/export-model-arch/index.dox               | 13 ++++++-----
 bin/index.dox                                 |  2 +-
 index.dox                                     |  8 +++++++
 .../compiler/machine_mapping/index.dox        |  8 ++++---
 lib/compiler/index.dox                        |  5 ++--
 lib/index.dox                                 | 23 ++++++++-----------
 .../include/models/split_test/index.dox       |  7 ++++++
 lib/models/index.dox                          |  4 ++++
 lib/op-attrs/include/op-attrs/ops/index.dox   |  2 +-
 lib/op-attrs/index.dox                        |  5 ++--
 lib/pcg/include/pcg/file_format/v1/index.dox  |  2 +-
 lib/pcg/index.dox                             | 10 ++++----
 lib/substitutions/index.dox                   | 12 ++++++----
 lib/task-spec/index.dox                       |  2 ++
 lib/utils/index.dox                           |  2 ++
 15 files changed, 68 insertions(+), 37 deletions(-)
 create mode 100644 index.dox
 create mode 100644 lib/models/include/models/split_test/index.dox

diff --git a/bin/export-model-arch/index.dox b/bin/export-model-arch/index.dox
index d804a424b7..c969e6481f 100644
--- a/bin/export-model-arch/index.dox
+++ b/bin/export-model-arch/index.dox
@@ -1,9 +1,10 @@
+namespace FlexFlow {
 /**
 
 @page export-model-arch export-model-arch
 
-A tool for exporting and visualizing the model computation graphs defined in @ref models.
-To build and run export-model-arch, run the following commands from the root of the FlexFlow Train repository:
+A tool for exporting (for details of the file format, see \ref file-format) and visualizing the model ComputationGraphs defined in @ref models.
+To build and run \c export-model-arch, run the following commands from the root of the %FlexFlow %Train repository:
 
 \verbatim
 $ proj cmake # if you haven't already
@@ -15,20 +16,20 @@ $ ./build/normal/bin/export-model-arch/export-model-arch -h
 
 The above should print the help message for `export-model-arch`. A few example commands are also listed below:
 
-- Export the `split_test` model in JSON (e.g., for processing outside of FlexFlow Train):
-
+- Export the \ref split-test model in JSON (e.g., for processing outside of %FlexFlow %Train):
 \verbatim
 $ ./build/normal/bin/export-model-arch/export-model-arch split_test
 \endverbatim
 
-- Export the `split_test` model in JSON along with the SP decomposition of the model's computation graph:
+- Export the \ref split-test model in JSON along with the SP decomposition of the model's ComputationGraph:
 \verbatim
 $ ./build/normal/bin/export-model-arch/export-model-arch --sp-decomposition split_test
 \endverbatim
 
-- Export the `split_test` model as DOT (e.g., for visualization using a [local](https://github.com/jrfonseca/xdot.py) or [web-based](https://dreampuf.github.io/GraphvizOnline/) DOT viewer)
+- Export the \ref split-test model as DOT (e.g., for visualization using a [local](https://github.com/jrfonseca/xdot.py) or [web-based](https://dreampuf.github.io/GraphvizOnline/) DOT viewer)
 \verbatim
 $ ./build/normal/bin/export-model-arch/export-model-arch --dot split_test
 \endverbatim
 
 */
+}
diff --git a/bin/index.dox b/bin/index.dox
index 08d73c1d81..04ee338fb6 100644
--- a/bin/index.dox
+++ b/bin/index.dox
@@ -1,6 +1,6 @@
 /**
 
-\mainpage bin
+\page bin bin/
 
 This directory contains command-line interfaces for %FlexFlow %Train and associated tools (all in C++).
 
diff --git a/index.dox b/index.dox
new file mode 100644
index 0000000000..73318d5315
--- /dev/null
+++ b/index.dox
@@ -0,0 +1,8 @@
+/**
+
+\mainpage FlexFlow
+
+- \subpage bin
+- \subpage lib
+
+*/
diff --git a/lib/compiler/include/compiler/machine_mapping/index.dox b/lib/compiler/include/compiler/machine_mapping/index.dox
index 2858103b2d..67452f2cb7 100644
--- a/lib/compiler/include/compiler/machine_mapping/index.dox
+++ b/lib/compiler/include/compiler/machine_mapping/index.dox
@@ -1,12 +1,14 @@
+namespace FlexFlow {
 /**
 
 @page machine-mapping Machine Mapping
 
-Contains the representations and logic for mappings of operators to machines/devices/GPUs.
+@brief Contains the representations and logic for mappings of operators to machines/devices/GPUs.
 
 Core functionality includes:
-- \ref FlexFlow::MachineView "MachineView": the compiler-side representation of a mapping.
-  For the runtime-side representation, see \ref FlexFlow::MappedOperatorTaskGroup
+- \ref MachineView "": the compiler-side representation of a mapping.
+  For the runtime-side representation, see \ref MappedOperatorTaskGroup.
 - \ref allowed_machine_views.h
 
 */
+}
diff --git a/lib/compiler/index.dox b/lib/compiler/index.dox
index bcf048cb1d..29feab4dcc 100644
--- a/lib/compiler/index.dox
+++ b/lib/compiler/index.dox
@@ -1,8 +1,9 @@
 /**
 
-@page compiler compiler
+\page compiler compiler/
+
+\brief Implements the core logic of the compiler.
 
-Implements the core logic of the compiler.
 This includes:
 
 - \subpage unity-dp-algorithm "Unity DP Algorithm"
diff --git a/lib/index.dox b/lib/index.dox
index 618f53104e..f5e6ff5294 100644
--- a/lib/index.dox
+++ b/lib/index.dox
@@ -1,22 +1,18 @@
+namespace FlexFlow {
 /**
 
-\mainpage lib
+\page lib lib/
 
 This directory contains the core C++ code that underlies %FlexFlow, organized into the following libraries:
 
-- \subpage compiler "":
+- \subpage compiler "": \copybrief compiler
 - \subpage kernels "":
-- \subpage op-attrs "":
-- \subpage pcg "": Contains the definitions of \ref FlexFlow::ComputationGraph and
-  \ref FlexFlow::ParallelComputationGraph, as well as code for serializing and deserializing
-  both graphs (\ref file-format)
-- \subpage substitutions "substitutions": Contains the definitions of pcg substitutions
-  (i.e., FlexFlow::Substitution), as well as the code for serializing them
-- \subpage utils "": Various utility and support libraries for the rest of the
-  project. Particularly useful are @ref "utils-graph", @ref "utils-containers",
-  and @ref "utils-cli".
-- \subpage models "":
-- \subpage task-spec "":
+- \subpage op-attrs "": \copybrief op-attrs
+- \subpage pcg "": \copybrief pcg
+- \subpage substitutions "": \copybrief substitutions
+- \subpage utils "": \copybrief utils
+- \subpage models "": \copybrief models
+- \subpage task-spec "": \copybrief task-spec
 - \subpage local-execution "":
 - \subpage realm-execution "":
 
@@ -27,3 +23,4 @@ This directory contains the core C++ code that underlies %FlexFlow, organized in
 - \c "substitution-generator":
 - \c "runtime": Out-of-date code migrated from the old %FlexFlow codebase. Currently kept around for reference, but will eventually be removed.
 */
+}
diff --git a/lib/models/include/models/split_test/index.dox b/lib/models/include/models/split_test/index.dox
new file mode 100644
index 0000000000..c5aa84706a
--- /dev/null
+++ b/lib/models/include/models/split_test/index.dox
@@ -0,0 +1,7 @@
+namespace FlexFlow {
+/**
+
+@page split-test models/split_test/
+
+*/
+}
diff --git a/lib/models/index.dox b/lib/models/index.dox
index 5ca79b3b34..9b9b308976 100644
--- a/lib/models/index.dox
+++ b/lib/models/index.dox
@@ -1,7 +1,10 @@
+namespace FlexFlow {
 /**
 
 \page models models
 
+\brief Pre-built \ref ComputationGraph ""s for various models for use in testing and evaluation.
+
 \section real-models Real Models
 - \subpage bert "BERT"
 - \subpage candle-uno "Candle UNO"
@@ -13,3 +16,4 @@
 - \subpage split-test
 
 */
+}
diff --git a/lib/op-attrs/include/op-attrs/ops/index.dox b/lib/op-attrs/include/op-attrs/ops/index.dox
index 4881221743..28a8e61c04 100644
--- a/lib/op-attrs/include/op-attrs/ops/index.dox
+++ b/lib/op-attrs/include/op-attrs/ops/index.dox
@@ -1,5 +1,5 @@
 /**
 
-\page op-attrs-ops "Operator Descriptions"
+\page op-attrs-ops op-attrs/ops/
 
 */
diff --git a/lib/op-attrs/index.dox b/lib/op-attrs/index.dox
index 6d3d6d8f60..677c01ef40 100644
--- a/lib/op-attrs/index.dox
+++ b/lib/op-attrs/index.dox
@@ -1,8 +1,9 @@
 /**
 
-\page op-attrs op-attrs
+\page op-attrs op-attrs/
+
+\brief Contains the compiler-side definition of all of the operators and associated functions for reasoning about their behavior, as well as the fundamental concepts needed to represent them.
 
-Contains the compiler-side definition of all of the operators and associated functions for reasoning about their behavior, as well as the fundamental concepts needed to represent them.
 Key pieces include:
 
 - Representing tensors in the compiler:
diff --git a/lib/pcg/include/pcg/file_format/v1/index.dox b/lib/pcg/include/pcg/file_format/v1/index.dox
index e6d0d4be4f..fba7f69017 100644
--- a/lib/pcg/include/pcg/file_format/v1/index.dox
+++ b/lib/pcg/include/pcg/file_format/v1/index.dox
@@ -1,5 +1,5 @@
 /**
 
-@page file-format pcg/file_format/v1
+@page file-format pcg/file_format/v1/
 
 */
diff --git a/lib/pcg/index.dox b/lib/pcg/index.dox
index 721c2ba062..55b478edf0 100644
--- a/lib/pcg/index.dox
+++ b/lib/pcg/index.dox
@@ -1,18 +1,19 @@
+namespace FlexFlow{
 /**
 
-\page pcg pcg
+\page pcg pcg/
 
-Defines the top-level datastructures and their serialization formats, along with some helper interfaces for constructing and manipulating them.
+@brief Defines the top-level datastructures (ComputationGraph, ParallelComputationGraph, and MappedParallelComputationGraph) and their serialization formats, along with some helper interfaces for constructing and manipulating them.
 
 \section pcg-datastructures Key Datastructures
 
-- \ref FlexFlow::ComputationGraph "ComputationGraph": aka CG
+- \ref ComputationGraph "": aka CG
 - \ref FlexFlow::ParallelComputationGraph "ParallelComputationGraph": aka PCG
 - \ref FlexFlow::MappedParallelComputationGraph "MappedParallelComputationGraph": aka MPCG
 
 \section serialization-formats Serialization
 
-- \subpage file-format "pcg/file_format"
+- \subpage file-format "pcg/file_format/"
 
 \section pcg-helpers Helper Functionality
 
@@ -21,3 +22,4 @@ Defines the top-level datastructures and their serialization formats, along with
 
 
 */
+}
diff --git a/lib/substitutions/index.dox b/lib/substitutions/index.dox
index 2c65c7362f..120cc164d2 100644
--- a/lib/substitutions/index.dox
+++ b/lib/substitutions/index.dox
@@ -1,14 +1,17 @@
+namespace FlexFlow {
 /**
 
 \page substitutions substitutions
 
+\brief Contains the definitions of pcg substitutions (i.e., Substitution), as well as the code for serializing them.
+
 \section substitution Substitution
 
-A \ref ::FlexFlow::Substitution is to replace a subgraph of the PCG by a new one. We refer to the subgraph to be replaced as the input graph, and the new subgraph to replace the input graph as the output graph.
+A \ref Substitution is to replace a subgraph of the PCG by a new one. We refer to the subgraph to be replaced as the input graph, and the new subgraph to replace the input graph as the output graph.
 
 \section pattern-matches PCGPattern and MultiDiGraphPatternMatch
 
-A \ref ::FlexFlow::PCGPattern is defined as an open graph with node label ::FlexFlow::OperatorPattern` and output label `ParallelTensorPattern`, which is refered to as the pattern graph. The graph structure of a `GraphPattern` instance defines the geometrical property of the input graph, while the node labels and output labels define the attribute property of that.
+A \ref PCGPattern is defined as an open graph with node label OperatorPattern and output label ParallelTensorPattern, which is referred to as the pattern graph. The graph structure of a GraphPattern instance defines the geometrical property of the input graph, while the node labels and output labels define the attribute property of that.
 
 To apply a substitution to a PCG, we should first match the pattern graph to a subgraph of the PCG. `MultiDiGraphPatternMatch` describes the match, which consists of
 * `node_assignment`: a mapping from the nodes of the pattern graph to the nodes of the PCG; and
@@ -19,8 +22,9 @@ The input graph derived by this match is then defined by `values(node_assignment
 
 \section output-graph-expr OutputGraphExpr
 
-An \ref ::FlexFlow::OutputGraphExpr is defined as an open graph with node label \ref ::FlexFlow::OutputOperatorAttrAssignment and output label \ref std::monostate.
+An \ref OutputGraphExpr is defined as an open graph with node label \ref OutputOperatorAttrAssignment and output label \ref std::monostate.
 
-\ref ::FlexFlow::OutputOperatorAttrAssignment is a collection of \ref ::FlexFlow::OperatorAttributeKey and \ref ::FlexFlow::OutputOperatorAttributeExpr pairs. It defines how the attributes of a single operator is calculated from the input graph. A pair `{operator_attribute_key, output_operator_attribute_expr}` in the collection means the value of `output_operator_attribute_expr` is assigned to the attribute named `operator_attribute_key` of the operator.
+\ref OutputOperatorAttrAssignment is a collection of \ref OperatorAttributeKey and \ref OutputOperatorAttributeExpr pairs. It defines how the attributes of a single operator are calculated from the input graph. A pair \c "{operator_attribute_key, output_operator_attribute_expr}" in the collection means the value of \c output_operator_attribute_expr is assigned to the attribute named \c operator_attribute_key of the operator.
 
 */
+}
diff --git a/lib/task-spec/index.dox b/lib/task-spec/index.dox
index bad8e8d5e1..7e71232cd3 100644
--- a/lib/task-spec/index.dox
+++ b/lib/task-spec/index.dox
@@ -2,4 +2,6 @@
 
 @page task-spec task-spec
 
+\brief An intermediate layer between the compiler and the runtime. Contains code for lowering the \ref MappedParallelComputationGraph exported from the \ref compiler down to a granularity that the runtime can actually execute. Also contains the functions that translate between logical operators (i.e., \ref op-attrs-ops) and actual calls to \ref kernels.
+
 */
diff --git a/lib/utils/index.dox b/lib/utils/index.dox
index bff8eefb26..07be0c58ce 100644
--- a/lib/utils/index.dox
+++ b/lib/utils/index.dox
@@ -2,6 +2,8 @@
 
 @page utils utils
 
+@brief Various utility and support libraries for the rest of the project. Particularly useful are @ref "utils-graph", @ref "utils-containers", and @ref "utils-cli".
+
 - \subpage utils-containers
 - \subpage utils-cli
 - \subpage utils-graph

From bc4c015b311fd87bc516925f3a02f69651ee57d3 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Wed, 4 Mar 2026 23:42:03 -0800
Subject: [PATCH 091/113] Flesh out a bunch of high-level docs

---
 bin/protobuf-to-json/README.md                |   3 -
 bin/protobuf-to-json/index.dox                |   7 +
 bin/substitution-to-dot/README.md             |   3 -
 bin/substitution-to-dot/index.dox             |   7 +
 docs/doxygen/Doxyfile                         |   4 +-
 flake.nix                                     |   1 +
 lib/index.dox                                 |  55 +++-
 lib/kernels/index.dox                         |   4 +-
 lib/local-execution/index.dox                 |   5 +
 lib/op-attrs/include/op-attrs/ops/index.dox   |  43 +++
 .../op-attrs/ops/linear_attrs.dtg.toml        |   5 +
 lib/op-attrs/src/op-attrs/ops/linear.cc       |   2 +
 .../dynamic_tensor_accessor_from_instance.h   |   3 +
 .../{pcg_instance => }/pcg_instance.h         |  34 +-
 .../include/realm-execution/realm_context.h   |   2 +-
 .../include/realm-execution/realm_manager.h   |   5 +-
 .../tasks/impl/controller_task.h              |   8 +
 ...rn_task.h => ff_handle_init_return_task.h} |   4 +-
 ...ndle_init_task.h => ff_handle_init_task.h} |   8 +-
 ...toml => ff_handle_init_task_args.dtg.toml} |   2 +-
 .../realm-execution/tasks/impl/index.dox      |  16 +
 .../realm-execution/tasks/impl/op_task.h      |   8 +-
 ...=> per_device_op_state_init_return_task.h} |   8 +-
 ...task.h => per_device_op_state_init_task.h} |   8 +-
 ...r_device_op_state_init_task_args.dtg.toml} |   0
 ...erializable_device_handle_init_task_args.h |  17 -
 ...lizable_ff_handle_init_task_args.dtg.toml} |   2 +-
 .../serializable_ff_handle_init_task_args.h   |  18 +
 ...r_device_op_state_init_task_args.dtg.toml} |   0
 ...able_per_device_op_state_init_task_args.h} |   0
 .../tensor_instance_backing.dtg.toml          |   5 +
 lib/realm-execution/index.dox                 |  41 ++-
 .../{pcg_instance => }/pcg_instance.cc        |   8 +-
 ..._task.cc => ff_handle_init_return_task.cc} |  20 +-
 ...le_init_task.cc => ff_handle_init_task.cc} |  28 +-
 ...> per_device_op_state_init_return_task.cc} |  20 +-
 ...sk.cc => per_device_op_state_init_task.cc} |  28 +-
 ... serializable_ff_handle_init_task_args.cc} |   0
 ...ble_per_device_op_state_init_task_args.cc} |   0
 .../test/src/realm-execution/test_e2e.cc      | 307 +++++++++---------
 .../include/task-spec/dynamic_graph/index.dox |  17 +
 lib/task-spec/include/task-spec/ops/index.dox |   9 +
 .../task_argument_accessor/index.dox          |  82 +++++
 lib/task-spec/index.dox                       |   9 +-
 lib/utils/include/utils/containers/index.dox  |   6 +
 lib/utils/include/utils/orthotope/index.dox   |   7 +
 lib/utils/index.dox                           |  30 +-
 47 files changed, 619 insertions(+), 280 deletions(-)
 delete mode 100644 bin/protobuf-to-json/README.md
 create mode 100644 bin/protobuf-to-json/index.dox
 delete mode 100644 bin/substitution-to-dot/README.md
 create mode 100644 bin/substitution-to-dot/index.dox
 rename lib/realm-execution/include/realm-execution/{pcg_instance => }/pcg_instance.h (83%)
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_handle_init_return_task.h => ff_handle_init_return_task.h} (88%)
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_handle_init_task.h => ff_handle_init_task.h} (80%)
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_handle_init_task_args.dtg.toml => ff_handle_init_task_args.dtg.toml} (93%)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/index.dox
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_state_init_return_task.h => per_device_op_state_init_return_task.h} (75%)
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_state_init_task.h => per_device_op_state_init_task.h} (86%)
 rename lib/realm-execution/include/realm-execution/tasks/impl/{device_state_init_task_args.dtg.toml => per_device_op_state_init_task_args.dtg.toml} (100%)
 delete mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
 rename lib/realm-execution/include/realm-execution/tasks/impl/{serializable_device_handle_init_task_args.dtg.toml => serializable_ff_handle_init_task_args.dtg.toml} (90%)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h
 rename lib/realm-execution/include/realm-execution/tasks/impl/{serializable_device_state_init_task_args.dtg.toml => serializable_per_device_op_state_init_task_args.dtg.toml} (100%)
 rename lib/realm-execution/include/realm-execution/tasks/impl/{serializable_device_state_init_task_args.h => serializable_per_device_op_state_init_task_args.h} (100%)
 rename lib/realm-execution/src/realm-execution/{pcg_instance => }/pcg_instance.cc (99%)
 rename lib/realm-execution/src/realm-execution/tasks/impl/{device_handle_init_return_task.cc => ff_handle_init_return_task.cc} (73%)
 rename lib/realm-execution/src/realm-execution/tasks/impl/{device_handle_init_task.cc => ff_handle_init_task.cc} (74%)
 rename lib/realm-execution/src/realm-execution/tasks/impl/{device_state_init_return_task.cc => per_device_op_state_init_return_task.cc} (71%)
 rename lib/realm-execution/src/realm-execution/tasks/impl/{device_state_init_task.cc => per_device_op_state_init_task.cc} (82%)
 rename lib/realm-execution/src/realm-execution/tasks/impl/{serializable_device_handle_init_task_args.cc => serializable_ff_handle_init_task_args.cc} (100%)
 rename lib/realm-execution/src/realm-execution/tasks/impl/{serializable_device_state_init_task_args.cc => serializable_per_device_op_state_init_task_args.cc} (100%)
 create mode 100644 lib/task-spec/include/task-spec/dynamic_graph/index.dox
 create mode 100644 lib/task-spec/include/task-spec/ops/index.dox
 create mode 100644 lib/task-spec/include/task-spec/task_argument_accessor/index.dox
 create mode 100644 lib/utils/include/utils/orthotope/index.dox

diff --git a/bin/protobuf-to-json/README.md b/bin/protobuf-to-json/README.md
deleted file mode 100644
index a1b1406e8b..0000000000
--- a/bin/protobuf-to-json/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# protobuf-to-json
-
-TODO
diff --git a/bin/protobuf-to-json/index.dox b/bin/protobuf-to-json/index.dox
new file mode 100644
index 0000000000..6c95370947
--- /dev/null
+++ b/bin/protobuf-to-json/index.dox
@@ -0,0 +1,7 @@
+namespace FlexFlow {
+/**
+
+\page protobuf-to-json
+
+*/
+}
diff --git a/bin/substitution-to-dot/README.md b/bin/substitution-to-dot/README.md
deleted file mode 100644
index 931c3cbdd3..0000000000
--- a/bin/substitution-to-dot/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# substitution-to-dot
-
-TODO
diff --git a/bin/substitution-to-dot/index.dox b/bin/substitution-to-dot/index.dox
new file mode 100644
index 0000000000..abbcbed5c6
--- /dev/null
+++ b/bin/substitution-to-dot/index.dox
@@ -0,0 +1,7 @@
+namespace FlexFlow {
+/**
+
+\page substitution-to-dot
+
+*/
+}
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index d68de44c85..3b20f35946 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -2435,7 +2435,7 @@ HIDE_UNDOC_RELATIONS   = YES
 # set to NO
 # The default value is: NO.
 
-HAVE_DOT               = NO
+HAVE_DOT               = YES
 
 # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
 # to run in parallel. When set to 0 doxygen will base this on the number of
@@ -2654,7 +2654,7 @@ DOT_IMAGE_FORMAT       = png
 # The default value is: NO.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-INTERACTIVE_SVG        = NO
+INTERACTIVE_SVG        = YES
 
 # The DOT_PATH tag can be used to specify the path where the dot tool can be
 # found. If left blank, it is assumed the dot tool can be found in the path.
diff --git a/flake.nix b/flake.nix
index dad0e2fc32..3e5c477dea 100644
--- a/flake.nix
+++ b/flake.nix
@@ -118,6 +118,7 @@
               compdb
               gbenchmark
               libtorch-bin
+              graphviz # for documentation
             ])
             (with proj-repo.packages.${system}; [
               proj
diff --git a/lib/index.dox b/lib/index.dox
index f5e6ff5294..6925d77249 100644
--- a/lib/index.dox
+++ b/lib/index.dox
@@ -6,17 +6,64 @@ namespace FlexFlow {
 This directory contains the core C++ code that underlies %FlexFlow, organized into the following libraries:
 
 - \subpage compiler "": \copybrief compiler
-- \subpage kernels "":
+- \subpage kernels "": \copybrief kernels
 - \subpage op-attrs "": \copybrief op-attrs
 - \subpage pcg "": \copybrief pcg
 - \subpage substitutions "": \copybrief substitutions
 - \subpage utils "": \copybrief utils
 - \subpage models "": \copybrief models
 - \subpage task-spec "": \copybrief task-spec
-- \subpage local-execution "":
-- \subpage realm-execution "":
+- \subpage local-execution "": \copybrief local-execution
+- \subpage realm-execution "": \copybrief realm-execution
 
-\section Deprecated
+\section runtime-vs-compiler Runtime vs. Compiler
+
+Logically, the functionality in \c lib/ can be split into two conceptual categories: the \e compiler and the \e runtime:
+
+- The compiler takes in a \ref ComputationGraph provided by the end user and transforms it into an optimized \ref MappedParallelComputationGraph.
+- The runtime takes in a \ref MappedParallelComputationGraph, which can be from the compiler or from any other tool that can generate files in FlexFlow's format (see \ref file-format for more details) and executes it, i.e., performs training iterations using the (usually distributed) execution strategy specified by the \ref MappedParallelComputationGraph.
+
+The distinction between the two in terms of libraries is a bit less clear.
+A few libraries are used exclusively in the compiler (i.e., \ref compiler) or in the runtime (\ref realm-execution), but most are used to some degree in both: for example, while \ref op-attrs is specified to contain the compiler-side representations of the operators, these representations are included in the \ref MappedParallelComputationGraph that is executed by the runtime, and so the library is actually used by both.
+In practice we use the shorthands "runtime-side" and "compiler-side" to refer to the following division of libraries:
+- compiler-side: \ref utils, \ref op-attrs, \ref pcg, \ref substitutions, and \ref compiler
+- runtime-side: \ref task-spec, \ref kernels, \ref local-execution, and \ref realm-execution.
+- neither: \ref models.
+
+The full (transitively-reduced) dependency graph of the libraries is as follows:
+
+\dot
+digraph example {
+    utils          [label="utils", URL="\ref utils", color="forestgreen", fontcolor="forestgreen"];
+    opattrs        [label="op-attrs", URL="\ref op-attrs", color="forestgreen", fontcolor="forestgreen"];
+    pcg            [label="pcg", URL="\ref pcg", color="forestgreen", fontcolor="forestgreen"];
+    substitutions  [label="substitutions", URL="\ref substitutions", color="forestgreen", fontcolor="forestgreen"];
+    models         [label="models", URL="\ref models"];
+    compiler       [label="compiler", URL="\ref compiler", color="forestgreen", fontcolor="forestgreen"];
+    kernels        [label="kernels", URL="\ref kernels", color="red", fontcolor="red"];
+    taskspec       [label="task-spec", URL="\ref task-spec", color="red", fontcolor="red"];
+    localexecution [label="local-execution", URL="\ref local-execution", color="red", fontcolor="red"];
+    realmexecution [label="realm-execution", URL="\ref realm-execution", color="red", fontcolor="red"];
+    realm          [label="realm", URL="\ref realm", color="red", fontcolor="red", style="dashed"];
+
+    utils -> opattrs;
+    opattrs -> pcg;
+    pcg -> substitutions;
+    substitutions -> compiler;
+    pcg -> models;
+    pcg -> kernels;
+    localexecution -> compiler [ style="dashed" ];
+    kernels -> taskspec;
+    taskspec -> localexecution;
+    localexecution -> realmexecution;
+    realm -> realmexecution;
+    realm -> kernels [ style="dashed" ];
+}
+\enddot
+
+where solid arrows represent link-time dependencies and dashed arrows represent run-time-only dependencies.
+
+\section lib-deprecated-components Deprecated Components
 
 - \c "local-pcg-execution":
 - \c "ffi":
diff --git a/lib/kernels/index.dox b/lib/kernels/index.dox
index 8ca8ad33ea..6465e11307 100644
--- a/lib/kernels/index.dox
+++ b/lib/kernels/index.dox
@@ -1,5 +1,7 @@
 /**
 
-\page kernels kernels
+\page kernels kernels/
+
+\brief %CPU and %GPU implementations of the operators, for use in the runtime and in operator profiling.
 
 */
diff --git a/lib/local-execution/index.dox b/lib/local-execution/index.dox
index aeaf73fc0f..0c7b06820e 100644
--- a/lib/local-execution/index.dox
+++ b/lib/local-execution/index.dox
@@ -1,7 +1,11 @@
+namespace FlexFlow {
 /**
 
 \page local-execution local-execution
 
+\brief Executes non-distributed \ref DynamicOpenDataflowGraph on local devices without using realm.
+       Used both for testing and (eventually by \ref realm-execution) for fusing operator task launches.
+
 The primary external-facing interface of local-execution.
 
 Major components:
@@ -17,3 +21,4 @@ Major components:
 - \ref "tensor_allocation.h": a pass for the dataflow graph that allocates all tensors
 
 */
+}
diff --git a/lib/op-attrs/include/op-attrs/ops/index.dox b/lib/op-attrs/include/op-attrs/ops/index.dox
index 28a8e61c04..e8c9f3b31e 100644
--- a/lib/op-attrs/include/op-attrs/ops/index.dox
+++ b/lib/op-attrs/include/op-attrs/ops/index.dox
@@ -1,5 +1,48 @@
+namespace FlexFlow {
 /**
 
 \page op-attrs-ops op-attrs/ops/
 
+\brief Contains the compiler-side definitions of the operators.
+
+More specifically, this consists of the following pieces:
+
+- A representation of the operator attributes (e.g., @ref LinearAttrs)
+- Functions for inferring weight and output shapes from the set of input shapes, e.g.,
+  - \ref "get_projection_shape(LinearAttrs const &, TensorShape const &)",
+  - \ref "get_bias_shape(LinearAttrs const &, TensorShape const &)", and
+  - \ref "get_output_shape(LinearAttrs const &, TensorShape const &)".
+
+  This procedure is termed <em>shape inference</em>.
+
+- Functions for inferring the parallelized weight and output shapes from the shapes of the parallelized input tensors, e.g.,
+  - \ref "get_projection_shape(LinearAttrs const &, ParallelTensorShape const &)",
+  - \ref "get_bias_shape(LinearAttrs const &, ParallelTensorShape const &)", and
+  - \ref "get_output_shape(LinearAttrs const &, ParallelTensorShape const &)"
+
+  This procedure is termed <em>parallel shape inference</em>.
+  The recommended way to do this currently is to exploit the fact that a \ref ParallelTensorShape is equivalent to a pair of a \ref TensorShape and a \ref ParallelTensorDimDegrees and replace the implementations of the above parallel shape inference functions with the following:
+  - \ref "get_projection_parallel_dim_degrees(LinearAttrs const &attrs, ParallelTensorDimDegrees const &input)"
+  - \ref "get_bias_parallel_dim_degrees(LinearAttrs const &attrs, ParallelTensorDimDegrees const &input)"
+  - \ref "get_output_parallel_dim_degrees(LinearAttrs const &attrs, ParallelTensorDimDegrees const &input)"
+
+  This allows us to implement parallel shape inference as a simple composition of functions, as in the following snippet from the %Linear operator:
+  \snippet lib/op-attrs/src/op-attrs/ops/linear.cc parallel shape inference composition example
+
+- A function for inferring the slot names for the incoming tensors (
+  \ref "std::unordered_map<TensorSlotName, IncomingTensorRole> get_linear_incoming_tensor_roles(LinearAttrs const &)")
+- Functions for computing the dependencies between shards of the parallelized input, weight, and output tensors, e.g.,
+  - \ref "OperatorSpaceToParallelTensorSpaceMapping get_operator_to_input_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
+  - \ref "OperatorSpaceToParallelTensorSpaceMapping get_operator_to_projection_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
+  - \ref "OperatorSpaceToParallelTensorSpaceMapping get_operator_to_output_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
+
+Note that as different operators have different numbers of inputs, etc., the number and signatures of these functions may be different for different operators. While keeping the structure of the various operators similar makes them easier to understand, it's not strictly necessary: the code that calls these functions for a generic operator allows custom behavior for each operator, which allows us to have a bit more freedom to evolve operator definitions over time:
+- \ref get_operator_to_ptensor_mappings (and associated functions in \ref get_operator_space_to_parallel_tensor_space_mappings.h)
+- \ref "get_incoming_tensor_roles(ComputationGraphOpAttrs const &)" (and associated functions in \ref get_incoming_tensor_roles.h)
+- \ref "get_output_shapes(ComputationGraphOpAttrs const &, std::unordered_map<TensorSlotName, TensorShape> const &input_shapes)" (and associated functions in \ref op-attrs/shape_inference.h)
+
+
+
+
 */
+}
diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml
index 9c8e0587c6..d771335cf6 100644
--- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml
+++ b/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml
@@ -9,6 +9,11 @@ features = [
   "rapidcheck",
   "fmt",
 ]
+docstring = """
+@brief Compiler-side representation of a %Linear operator.
+
+For details on how operators are represented on the compiler side, see @ref op-attrs-ops.
+"""
 
 includes = [
   "op-attrs/datatype.dtg.h",
diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc
index 2518df77e4..a9f8fdf02a 100644
--- a/lib/op-attrs/src/op-attrs/ops/linear.cc
+++ b/lib/op-attrs/src/op-attrs/ops/linear.cc
@@ -109,6 +109,7 @@ tl::expected<std::unordered_map<TensorSlotName, TensorShape>, std::string>
   return weight_shapes;
 }
 
+//! [parallel shape inference composition example]
 tl::expected<ParallelTensorShape, std::string>
     get_projection_shape(LinearAttrs const &attrs,
                          ParallelTensorShape const &input) {
@@ -126,6 +127,7 @@ tl::expected<ParallelTensorShape, std::string>
 
   return lift_to_parallel_with_degrees(unpar, projection_degrees);
 }
+//! [parallel shape inference composition example]
 
 tl::expected<ParallelTensorShape, std::string>
     get_bias_shape(LinearAttrs const &attrs, ParallelTensorShape const &input) {
diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
index 8c8ccf6ac4..638e2f3b22 100644
--- a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
+++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
@@ -8,6 +8,9 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief Turn a %Realm region instance into a GenericTensorAccessor.
+ */
 DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
     Realm::RegionInstance inst,
     Realm::Event ready,
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h
similarity index 83%
rename from lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
rename to lib/realm-execution/include/realm-execution/pcg_instance.h
index b795d53d56..a966c9a01b 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_PCG_INSTANCE_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_PCG_INSTANCE_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
 
 #include "kernels/accessor.h"
 #include "kernels/allocation.h"
@@ -24,14 +24,17 @@
 namespace FlexFlow {
 
 /**
- * @brief The main public interface for the realm backend.
- *
- * It takes a MappedParallelComputationGraph and lowers it through the
+ * \brief The main public interface for the realm backend.
+ * Takes a MappedParallelComputationGraph and lowers it through
  * DynamicOpenDataflowGraph to get the fully-specified execution order of tasks
- * to be executed. Besides the usual dynamic graph passes (\ref
- * perform_pass_expansion, \ref perform_update_insertion, \ref
- * perform_shard_expansion), this class also tracks the allocation of realm
- * instances for tensors.
+ * to be executed. Also tracks the allocation of realm instances for tensors
+ * through its TensorInstanceBacking.
+ *
+ * \note PCGInstance is primarily just a container for the various structs held
+ * inside it. The actual initialization and training iteration functionality is
+ * held in \ref create_pcg_instance and \ref
+ * perform_update_pass_for_pcg_instance, respectively.
+ *
  */
 struct PCGInstance {
 public:
@@ -49,13 +52,16 @@ struct PCGInstance {
 
   ~PCGInstance();
 
+  void update_optimizer_attrs_for_next_iter();
+
+  // getters
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   TensorInstanceBacking const &get_tensor_instance_backing() const;
   PerDeviceOpStateBacking const &get_device_state_backing() const;
   OptimizerAttrs const &get_optimizer_attrs() const;
-  void update_optimizer_attrs_for_next_iter();
   std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
+
 private:
   RealmContext &ctx;
   std::vector<DynamicNodeInvocation> execution_order;
@@ -79,6 +85,14 @@ PCGInstance create_pcg_instance(
     DistributedDeviceHandle const &device_handle,
     FFIterationConfig const &iteration_config);
 
+/**
+ * \brief Dispatch a training iteration for a PCGInstance.
+ *
+ * To dispatch just a piece of a training iteration, see the following functions:
+ * - \ref perform_forward_pass_for_pcg_instance
+ * - \ref perform_backward_pass_for_pcg_instance
+ * - \ref perform_update_pass_for_pcg_instance
+ */
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_pcg_instance(
         PCGInstance &pcg_instance,
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 25bbecfb82..2aba46e47a 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -18,7 +18,7 @@ namespace FlexFlow {
  * @brief An interface that wraps the rest of realm and protects against certain
  * classes of bugs, such as shutdown bugs.
  *
- * @note Do NOT call Realm directly unless you know what you are doing.
+ * @warning Do NOT call Realm directly unless you know what you are doing.
  */
 struct RealmContext {
 public:
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 5c673d4134..89272c591a 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -11,9 +11,8 @@ namespace FlexFlow {
 
 /**
  * @brief Manages the initialization and shutdown of the realm runtime.
- *
- * Provides the interface to launch the controller that runs the rest of the computation
- * (i.e., \ref start_controller).
+ * Provides the interface to launch the \ref term-controller that runs the rest of the computation
+ * (i.e., \ref start_controller).
  */
 struct RealmManager : private RealmContext {
 public:
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
index 7134973ead..07a324f973 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -6,9 +6,17 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief A stub function to work around Realm not allowing lambdas to be registered as Realm tasks.
+ * Takes the desired lambda to run as the \ref term-controller as an argument and immediately calls it.
+ */
 void controller_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
+/**
+ * \brief Dispatches the \ref term-controller task. Packages up the provided \ref std::function and
+ * passes it along to \ref controller_task_body.
+ */
 Realm::Event
     collective_spawn_controller_task(RealmContext &ctx,
                                      Realm::Processor &target_proc,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
similarity index 88%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
rename to lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index a87652b5ce..cf45cd8b67 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -7,10 +7,10 @@
 
 namespace FlexFlow {
 
-void device_handle_init_return_task_body(
+void ff_handle_init_return_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_device_handle_init_return_task(
+Realm::Event spawn_ff_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificManagedPerDeviceFFHandle const &result,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
similarity index 80%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
rename to lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
index 312ed26add..89485100af 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_FF_HANDLE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_FF_HANDLE_INIT_TASK_H
 
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
 #include "realm-execution/realm.h"
@@ -7,10 +7,10 @@
 
 namespace FlexFlow {
 
-void device_handle_init_task_body(
+void ff_handle_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_device_handle_init_task(
+Realm::Event spawn_ff_handle_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     size_t workSpaceSize,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml
similarity index 93%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
rename to lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml
index c0ba37bb5d..808a350091 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "DeviceHandleInitTaskArgs"
+name = "FfHandleInitTaskArgs"
 type = "struct"
 features = []
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
new file mode 100644
index 0000000000..89e4e9642e
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
@@ -0,0 +1,16 @@
+namespace {
+/**
+
+\page realm-execution-tasks tasks/
+
+\c realm-execution groups tasks into the following kinds:
+
+- \ref controller_task.h "Controller Tasks": At most one of these per machine. Runs the \ref term-controller.
+- \ref op_task.h "Operator Tasks":
+- \ref ff_handle_init_task.h "FF Handle Init Tasks":
+- \ref ff_handle_init_return_task.h "FF Handle Init Return Tasks":
+- \ref per_device_op_state_init_task.h "Per Device Op State Init Tasks":
+- \ref per_device_op_state_init_return_task.h "Per Device Op State Init Return Tasks":
+
+*/
+}
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index e089756741..e413f6da31 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -16,12 +16,16 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief The function registered as a %Realm task for operator-related tasks.
+ * Dispatched by \ref spawn_op_task.
+ */
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
  * @brief Launches the task for a DynamicNodeInvocation using realm.
  *
- * @note The task launch process functions a bit differently to that used in the
+ * The task launch process functions a bit differently to that used in the
  * previous FlexFlow codebase. Rather than having a function registered with
  * realm/legion for every task_id_t, we now have only a few functions
  * registered: @ref op_task_body, @ref device_handle_init_task_body,
@@ -37,7 +41,7 @@ void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
  * (which then uses @ref call_fwd_task_impl) to actually call the function in
  * lib/task-spec/src/task-spec/ops/impl/conv_2d.cc
  *
- * @note That the above also means that we don't have a separate
+ * The above also means that we don't have a separate
  * ITaskArgumentAccessor subclass for realm-execution. Instead we ship over the
  * information on the corresponding realm instances over to the remote node,
  * grab the corresponding pointer/GenericTensorAccessor, and then use
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
similarity index 75%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
rename to lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
index 4de7e5689f..7027ad7555 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_RETURN_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_RETURN_TASK_H
 
 #include "realm-execution/device_specific_ptr.h"
 #include "realm-execution/realm.h"
@@ -8,10 +8,10 @@
 
 namespace FlexFlow {
 
-void device_state_init_return_task_body(
+void per_device_op_state_init_return_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-Realm::Event spawn_device_state_init_return_task(
+Realm::Event spawn_per_device_op_state_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificPtr<PerDeviceOpState> const &result,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
similarity index 86%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
rename to lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
index 657d2e8401..1c7db6e0d4 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_TASK_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_TASK_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_TASK_H
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
@@ -14,10 +14,10 @@
 
 namespace FlexFlow {
 
-void device_state_init_task_body(
+void per_device_op_state_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
-std::optional<Realm::Event> spawn_device_state_init_task(
+std::optional<Realm::Event> spawn_per_device_op_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     DynamicNodeInvocation const &invocation,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
similarity index 100%
rename from lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
rename to lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
deleted file mode 100644
index 63d70fe10a..0000000000
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H
-
-#include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
-#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.h"
-
-namespace FlexFlow {
-
-SerializableDeviceHandleInitTaskArgs
-    device_handle_init_task_args_to_serializable(
-        DeviceHandleInitTaskArgs const &);
-DeviceHandleInitTaskArgs device_handle_init_task_args_from_serializable(
-    SerializableDeviceHandleInitTaskArgs const &);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.toml
similarity index 90%
rename from lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
rename to lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.toml
index 34f52880f8..9d7414aac6 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "SerializableDeviceHandleInitTaskArgs"
+name = "SerializableFfHandleInitTaskArgs"
 type = "struct"
 features = [
   "eq",
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h
new file mode 100644
index 0000000000..625475f0ae
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h
@@ -0,0 +1,24 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_FF_HANDLE_INIT_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_FF_HANDLE_INIT_TASK_ARGS_H
+
+#include "realm-execution/tasks/impl/ff_handle_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * \brief Converts an FfHandleInitTaskArgs into its serializable counterpart.
+ */
+SerializableFfHandleInitTaskArgs
+    ff_handle_init_task_args_to_serializable(
+        FfHandleInitTaskArgs const &);
+
+/**
+ * \brief Converts a SerializableFfHandleInitTaskArgs back into an FfHandleInitTaskArgs.
+ */
+FfHandleInitTaskArgs ff_handle_init_task_args_from_serializable(
+    SerializableFfHandleInitTaskArgs const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml
similarity index 100%
rename from lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
rename to lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
similarity index 100%
rename from lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
rename to lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index b8533dbcc9..929b8e5ce3 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -6,6 +6,11 @@ features = [
   "fmt",
   #"hash",
 ]
+docstring = """
+@brief A simple container for mapping between DynamicValueAttrs and the corresponding realm instances/events.
+
+@note The actual logic for doing instance allocation is in @ref perform_instance_allocation.
+"""
 
 includes = [
   "<unordered_map>",
diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox
index a225ef5b30..e8f24507cc 100644
--- a/lib/realm-execution/index.dox
+++ b/lib/realm-execution/index.dox
@@ -1,28 +1,34 @@
+namespace FlexFlow {
 /**
 
 \page realm-execution realm-execution
 
+\brief Executes distributed \ref MappedParallelComputationGraph ""s using realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec.
+       Used both for testing and (eventually by \ref realm-execution) for fusing operator task launches.
+
 The %Realm backend for distributed execution.
 
 This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution.
 
-\section realm-execution-major-components Major Components
+\section realm-execution-usage Example Usage
 
-- \ref "FlexFlow::PCGInstance": the main public interface for the Realm backend. It takes a mapped PCG and lowers it through the dynamic graph to get the fully-specified execution order of tasks to be executed. Besides the usual dynamic graph passes (pass expansion, update insertion, shard expansion), this class also tracks the allocation of Realm instances for tensors.
-- \ref "FlexFlow::RealmManager": manages the initialization and shutdown of the Realm runtime. Provides the interface to launch the controller that runs the rest of the computation.
-- \ref "FlexFlow::RealmContext": an interface that wraps the rest of Realm and protects against certain classes of bugs, such as shutdown bugs. **Do NOT call Realm directly unless you know what you are doing.**
-- @ref "include/realm-execution/tasks": the Realm task implementations and their supporting infrastructure.
-  - @ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
-  - @ref "lib/realm-execution/include/realm-exectuion/tasks/serializer/" "serializer/": additional support for serializing Realm data types.
-  - @ref realm_task_registry.h: manages the registration of %Realm tasks. All %Realm tasks go through this interface.
-  - @ref task_id_t.h and @ref realm_task_id_t.h: types to represent %Realm tasks, along with an encoding to %Realm's native task ID type.
+\snippet{local} lib/realm-execution/test/src/realm-execution/test_e2e.cc realm-execution example
 
-\section realm-execution-other-components Other components used mainly within \ref FlexFlow::PCGInstance
+\section realm-execution-major-components Major Components
 
- - @ref "::FlexFlow::DistributedDeviceHandle": represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
- - @ref "::FlexFlow::DependencySet": tracks dependencies during execution of tasks.
- - @ref "distributed_device_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting FlexFlow::PerDeviceOpStateBacking.
- - @ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting FlexFlow::TensorInstanceBacking.
+- \ref PCGInstance "": \copybrief PCGInstance
+- \ref RealmManager "": \copybrief RealmManager
+- \ref RealmContext "": \copybrief RealmContext
+- \ref "include/realm-execution/tasks": The Realm task implementations and their supporting infrastructure.
+  - \ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
+  - \ref "lib/realm-execution/include/realm-execution/tasks/serializer/" "serializer/": additional support for serializing %Realm data types.
+  - \ref realm_task_registry.h: Manages the registration of %Realm tasks. All %Realm tasks go through this interface.
+  - \ref task_id_t.h and \ref realm_task_id_t.h: Types to represent %Realm tasks, along with an encoding to %Realm's native task ID type.
+- Helper components (mainly used within PCGInstance)
+  - \ref DistributedDeviceHandle "": represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
+  - \ref DependencySet "": tracks dependencies during execution of tasks.
+  - \ref "distributed_device_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting PerDeviceOpStateBacking.
+  - \ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting TensorInstanceBacking.
 
 \section realm-execution-todo TODO
 
@@ -35,4 +41,11 @@ This is a single-controller implementation. That means the controller (the task
 - control replication
 - Realm subgraphs
 
+\section terminology Terminology
+
+\subsection term-controller controller
+
+The main thread/function that, in a non-controlled implementation, processes the task graph and dispatches all of the tasks.
+
 */
+}
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc
similarity index 99%
rename from lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
rename to lib/realm-execution/src/realm-execution/pcg_instance.cc
index d78ed68988..ead569f4ce 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/pcg_instance/pcg_instance.h"
+#include "realm-execution/pcg_instance.h"
 #include "op-attrs/tensor_slot_name.dtg.h"
 #include "pcg/optimizer_attrs.h"
 #include "realm-execution/dependency_set.h"
@@ -47,23 +47,29 @@ PCGInstance::~PCGInstance() {
 RealmContext &PCGInstance::get_realm_context() {
   return this->ctx;
 }
+
 std::vector<DynamicNodeInvocation> const &
     PCGInstance::get_execution_order() const {
   return this->execution_order;
 }
+
 TensorInstanceBacking const &PCGInstance::get_tensor_instance_backing() const {
   return this->tensor_instance_backing;
 }
+
 PerDeviceOpStateBacking const &PCGInstance::get_device_state_backing() const {
   return this->device_state_backing;
 }
+
 OptimizerAttrs const &PCGInstance::get_optimizer_attrs() const {
   return this->optimizer_attrs;
 }
+
 void PCGInstance::update_optimizer_attrs_for_next_iter() {
   this->optimizer_attrs =
       get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
 }
+
 std::optional<Realm::RegionInstance>
     PCGInstance::get_loss_tensor_instance() const {
   return this->logit_grad_tensor;
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
similarity index 73%
rename from lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
index bda6f7781c..552da3cb01 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
@@ -1,12 +1,12 @@
-#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/tasks/impl/ff_handle_init_task.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 
 namespace FlexFlow {
 
-struct DeviceHandleInitReturnTaskArgs {
+struct FfHandleInitReturnTaskArgs {
 public:
-  DeviceHandleInitReturnTaskArgs() = delete;
-  DeviceHandleInitReturnTaskArgs(
+  FfHandleInitReturnTaskArgs() = delete;
+  FfHandleInitReturnTaskArgs(
       DeviceSpecificManagedPerDeviceFFHandle result,
       Realm::Processor origin_proc,
       DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr)
@@ -19,26 +19,26 @@ struct DeviceHandleInitReturnTaskArgs {
   DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
 };
 
-void device_handle_init_return_task_body(void const *args,
+void ff_handle_init_return_task_body(void const *args,
                                          size_t arglen,
                                          void const *userdata,
                                          size_t userlen,
                                          Realm::Processor proc) {
-  ASSERT(arglen == sizeof(DeviceHandleInitReturnTaskArgs));
-  DeviceHandleInitReturnTaskArgs task_args =
-      *reinterpret_cast<DeviceHandleInitReturnTaskArgs const *>(args);
+  ASSERT(arglen == sizeof(FfHandleInitReturnTaskArgs));
+  FfHandleInitReturnTaskArgs task_args =
+      *reinterpret_cast<FfHandleInitReturnTaskArgs const *>(args);
 
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
   *task_args.origin_result_ptr = task_args.result;
 }
 
-Realm::Event spawn_device_handle_init_return_task(
+Realm::Event spawn_ff_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificManagedPerDeviceFFHandle const &result,
     DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
     Realm::Event precondition) {
-  DeviceHandleInitReturnTaskArgs task_args{
+  FfHandleInitReturnTaskArgs task_args{
       result, origin_proc, origin_result_ptr};
 
   return ctx.spawn_task(origin_proc,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
similarity index 74%
rename from lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
index 87460cc5a7..ca0a5bf2cd 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
@@ -1,8 +1,8 @@
-#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/tasks/impl/ff_handle_init_task.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
-#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
-#include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
-#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.h"
+#include "realm-execution/tasks/impl/ff_handle_init_return_task.h"
+#include "realm-execution/tasks/impl/ff_handle_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include <type_traits>
@@ -10,7 +10,7 @@
 namespace FlexFlow {
 
 static std::optional<ManagedPerDeviceFFHandle *>
-    make_device_handle_for_processor(Realm::Processor processor,
+    make_ff_handle_for_processor(Realm::Processor processor,
                                      size_t workSpaceSize,
                                      bool allowTensorOpMathConversion) {
   switch (processor.kind()) {
@@ -28,33 +28,33 @@ static std::optional<ManagedPerDeviceFFHandle *>
   }
 }
 
-void device_handle_init_task_body(void const *args,
+void ff_handle_init_task_body(void const *args,
                                   size_t arglen,
                                   void const *userdata,
                                   size_t userlen,
                                   Realm::Processor proc) {
-  DeviceHandleInitTaskArgs task_args =
-      device_handle_init_task_args_from_serializable(
-          deserialize_task_args<SerializableDeviceHandleInitTaskArgs>(args,
+  FfHandleInitTaskArgs task_args =
+      ff_handle_init_task_args_from_serializable(
+          deserialize_task_args<SerializableFfHandleInitTaskArgs>(args,
                                                                       arglen));
 
   RealmContext ctx{proc};
   DeviceSpecificManagedPerDeviceFFHandle managed_handle =
       make_device_specific_managed_handle(
           ctx.get_current_device_idx(),
-          make_device_handle_for_processor(
+          make_ff_handle_for_processor(
               proc,
               task_args.workSpaceSize,
               task_args.allowTensorOpMathConversion));
 
-  spawn_device_handle_init_return_task(ctx,
+  spawn_ff_handle_init_return_task(ctx,
                                        task_args.origin_proc,
                                        managed_handle,
                                        task_args.origin_result_ptr,
                                        Realm::Event::NO_EVENT);
 }
 
-Realm::Event spawn_device_handle_init_task(
+Realm::Event spawn_ff_handle_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     size_t workSpaceSize,
@@ -62,7 +62,7 @@ Realm::Event spawn_device_handle_init_task(
     DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
     Realm::Event precondition) {
 
-  DeviceHandleInitTaskArgs task_args = DeviceHandleInitTaskArgs{
+  FfHandleInitTaskArgs task_args = FfHandleInitTaskArgs{
       workSpaceSize,
       allowTensorOpMathConversion,
       ctx.get_current_processor(),
@@ -70,7 +70,7 @@ Realm::Event spawn_device_handle_init_task(
   };
 
   std::string serialized_args = serialize_task_args(
-      device_handle_init_task_args_to_serializable(task_args));
+      ff_handle_init_task_args_to_serializable(task_args));
   return ctx.spawn_task(target_proc,
                         task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
                         serialized_args.data(),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
similarity index 71%
rename from lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
index a1a7eb84a8..af04a835f2 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
@@ -1,12 +1,12 @@
-#include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 
 namespace FlexFlow {
 
-struct DeviceStateInitReturnTaskArgs {
+struct PerDeviceOpStateInitReturnTaskArgs {
 public:
-  DeviceStateInitReturnTaskArgs() = delete;
-  DeviceStateInitReturnTaskArgs(
+  PerDeviceOpStateInitReturnTaskArgs() = delete;
+  PerDeviceOpStateInitReturnTaskArgs(
       DeviceSpecificPtr<PerDeviceOpState> result,
       Realm::Processor origin_proc,
       DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr)
@@ -19,26 +19,26 @@ struct DeviceStateInitReturnTaskArgs {
   DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr;
 };
 
-void device_state_init_return_task_body(void const *args,
+void per_device_op_state_init_return_task_body(void const *args,
                                         size_t arglen,
                                         void const *userdata,
                                         size_t userlen,
                                         Realm::Processor proc) {
-  ASSERT(arglen == sizeof(DeviceStateInitReturnTaskArgs));
-  DeviceStateInitReturnTaskArgs task_args =
-      *reinterpret_cast<DeviceStateInitReturnTaskArgs const *>(args);
+  ASSERT(arglen == sizeof(PerDeviceOpStateInitReturnTaskArgs));
+  PerDeviceOpStateInitReturnTaskArgs task_args =
+      *reinterpret_cast<PerDeviceOpStateInitReturnTaskArgs const *>(args);
 
   ASSERT(task_args.origin_proc.address_space() == proc.address_space());
   *task_args.origin_result_ptr = task_args.result;
 }
 
-Realm::Event spawn_device_state_init_return_task(
+Realm::Event spawn_per_device_op_state_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
     DeviceSpecificPtr<PerDeviceOpState> const &result,
     DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr,
     Realm::Event precondition) {
-  DeviceStateInitReturnTaskArgs task_args{
+  PerDeviceOpStateInitReturnTaskArgs task_args{
       result, origin_proc, origin_result_ptr};
 
   return ctx.spawn_task(origin_proc,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
similarity index 82%
rename from lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
index 50c8daffb0..50231c554b 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
@@ -1,9 +1,9 @@
-#include "realm-execution/tasks/impl/device_state_init_task.h"
-#include "local-execution/device_state_initialization.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_task.h"
+#include "local-execution/per_device_op_state_initialization.h"
 #include "realm-execution/dynamic_tensor_accessor_from_instance.h"
-#include "realm-execution/tasks/impl/device_state_init_return_task.h"
-#include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
-#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h"
 #include "realm-execution/tasks/serializer/task_arg_serializer.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.h"
@@ -18,14 +18,14 @@
 
 namespace FlexFlow {
 
-void device_state_init_task_body(void const *args,
+void per_device_op_state_init_task_body(void const *args,
                                  size_t arglen,
                                  void const *userdata,
                                  size_t userlen,
                                  Realm::Processor proc) {
-  DeviceStateInitTaskArgs task_args =
-      device_state_init_task_args_from_serializable(
-          deserialize_task_args<SerializableDeviceStateInitTaskArgs>(args,
+  PerDeviceOpStateInitTaskArgs task_args =
+      per_device_op_state_init_task_args_from_serializable(
+          deserialize_task_args<SerializablePerDeviceOpStateInitTaskArgs>(args,
                                                                      arglen));
 
   RealmContext ctx{proc};
@@ -62,18 +62,18 @@ void device_state_init_task_body(void const *args,
   // Important: to make sure this doesn't get deallocated, we intentionally leak
   // the allocation here
   PerDeviceOpState *result_state_ptr =
-      new PerDeviceOpState{get_device_state_from_device_specific(
+      new PerDeviceOpState{get_per_device_op_state_from_device_specific(
           result_state, ctx.get_current_device_idx())};
   DeviceSpecificPtr<PerDeviceOpState> result_device_specific{
       ctx.get_current_device_idx(), result_state_ptr};
-  spawn_device_state_init_return_task(ctx,
+  spawn_per_device_op_state_init_return_task(ctx,
                                       task_args.origin_proc,
                                       result_device_specific,
                                       task_args.origin_result_ptr,
                                       Realm::Event::NO_EVENT);
 }
 
-std::optional<Realm::Event> spawn_device_state_init_task(
+std::optional<Realm::Event> spawn_per_device_op_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
     DynamicNodeInvocation const &invocation,
@@ -84,7 +84,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
     OptimizerAttrs const &optimizer_attrs,
     DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
     Realm::Event precondition) {
-  DeviceStateInitTaskArgs task_args{
+  PerDeviceOpStateInitTaskArgs task_args{
       invocation,
       tensor_backing,
       profiling_settings,
@@ -103,7 +103,7 @@ std::optional<Realm::Event> spawn_device_state_init_task(
                get_init_task_id_for_op_attrs);
   if (task_id.has_value()) {
     std::string args = serialize_task_args(
-        device_state_init_task_args_to_serializable(task_args));
+        per_device_op_state_init_task_args_to_serializable(task_args));
     return ctx.spawn_task(target_proc,
                           assert_unwrap(task_id),
                           args.data(),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
similarity index 100%
rename from lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
similarity index 100%
rename from lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
rename to lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 0914c054d7..11d2f14f1c 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     int fake_argc = fake_args.size();
     char **fake_argv = fake_args.data();
 
-    RealmManager manager(&fake_argc, &fake_argv);
+    RealmManager manager = RealmManager{&fake_argc, &fake_argv};
 
     (void)manager.start_controller([](RealmContext &ctx) {
       Allocator allocator = ctx.get_current_device_allocator();
@@ -262,6 +262,159 @@ TEST_SUITE(FF_TEST_SUITE) {
 
 TEST_SUITE(FF_CUDA_TEST_SUITE) {
   TEST_CASE("RealmBackend e2e Training (GPU Model Parallelism)") {
+    positive_int batch_size = 10_p;
+    positive_int data_dim = 16_p;
+    positive_int hidden_dim = 32_p;
+    positive_int output_dim = 1_p;
+
+    TensorShape output_tensor_shape = TensorShape{
+        TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+
+    // construct computation graph
+    ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+    TensorShape input_tensor_shape = TensorShape{
+        TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+    TensorShape label_tensor_shape = TensorShape{
+        TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+    GenericTensorAccessorW label_tensor =
+        allocator.allocate_tensor(label_tensor_shape);
+
+    TensorShape weight_shape_1 = TensorShape{
+        TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
+    TensorShape weight_shape_2 = TensorShape{
+        TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
+
+    ParallelLayerAddedResult inputs_layer =
+        pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+    parallel_tensor_guid_t t_input =
+        require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+
+    ParallelLayerAddedResult weights_layer_1 = add_parallel_layer(
+        pcg,
+        ParallelLayerAttrs{
+            PCGOperatorAttrs{WeightAttrs{
+                weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+            std::nullopt},
+        {},
+        {});
+    parallel_tensor_guid_t t_weights_1 =
+        require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT);
+
+    ParallelLayerAddedResult weights_layer_2 = add_parallel_layer(
+        pcg,
+        ParallelLayerAttrs{
+            PCGOperatorAttrs{WeightAttrs{
+                weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+            std::nullopt},
+        {},
+        {});
+    parallel_tensor_guid_t t_weights_2 =
+        require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT);
+
+    ParallelLayerAddedResult linear_operator_1 = add_parallel_layer(
+        pcg,
+        ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim,
+                                                        /*use_bias=*/false,
+                                                        DataType::FLOAT,
+                                                        Activation::RELU,
+                                                        std::nullopt}},
+                           std::nullopt},
+        {
+            {
+                TensorSlotName::INPUT,
+                t_input,
+            },
+        },
+        {
+            {
+                TensorSlotName::WEIGHT,
+                t_weights_1,
+            },
+        });
+    parallel_tensor_guid_t t_linear_1 =
+        require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT);
+
+    ParallelLayerAddedResult linear_operator_2 = add_parallel_layer(
+        pcg,
+        ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim,
+                                                        /*use_bias=*/false,
+                                                        DataType::FLOAT,
+                                                        Activation::RELU,
+                                                        std::nullopt}},
+                           std::nullopt},
+        {
+            {
+                TensorSlotName::INPUT,
+                t_linear_1,
+            },
+        },
+        {
+            {
+                TensorSlotName::WEIGHT,
+                t_weights_2,
+            },
+        });
+    parallel_tensor_guid_t t_linear_2 =
+        require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
+
+    MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU};
+    ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
+    MappedParallelComputationGraph mpcg{
+        pcg,
+        {
+            {inputs_layer.parallel_layer,
+             MappedOperatorTaskGroup{
+                 {{gpu0,
+                   OperatorAtomicTaskShardBinding{
+                       {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+            {weights_layer_1.parallel_layer,
+             MappedOperatorTaskGroup{
+                 {{gpu0,
+                   OperatorAtomicTaskShardBinding{
+                       {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+            {weights_layer_2.parallel_layer,
+             MappedOperatorTaskGroup{
+                 {{gpu0,
+                   OperatorAtomicTaskShardBinding{
+                       {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+            {linear_operator_1.parallel_layer,
+             MappedOperatorTaskGroup{
+                 {{gpu0,
+                   OperatorAtomicTaskShardBinding{{
+                       {TensorSlotName::INPUT, tensor_coord0},
+                       {TensorSlotName::WEIGHT, tensor_coord0},
+                       {TensorSlotName::OUTPUT, tensor_coord0},
+                   }}}}}},
+            {linear_operator_2.parallel_layer,
+             MappedOperatorTaskGroup{
+                 {{gpu0,
+                   OperatorAtomicTaskShardBinding{{
+                       {TensorSlotName::INPUT, tensor_coord0},
+                       {TensorSlotName::WEIGHT, tensor_coord0},
+                       {TensorSlotName::OUTPUT, tensor_coord0},
+                   }}}}}},
+        },
+    };
+    MappedOperatorTaskGroup loss_mapping{
+        {{gpu0,
+          OperatorAtomicTaskShardBinding{{
+              {TensorSlotName::INPUT, tensor_coord0},
+              {TensorSlotName::LOGIT, tensor_coord0},
+          }}}}};
+
+    // instantiate computation graph
+    LossAttrs loss_attrs = LossAttrs{
+        NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+    OptimizerAttrs optimizer_attrs =
+        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                         /*momentum=*/0.9,
+                                         /*nesterov=*/false,
+                                         /*weight_decay=*/0.001}};
+
+
+//! [realm-execution example]
     std::vector<char *> fake_args =
         make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/1_n);
     int fake_argc = fake_args.size();
@@ -272,160 +425,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     (void)manager.start_controller([](RealmContext &ctx) {
       Allocator allocator = ctx.get_current_device_allocator();
 
-      positive_int batch_size = 10_p;
-      positive_int data_dim = 16_p;
-      positive_int hidden_dim = 32_p;
-      positive_int output_dim = 1_p;
-
-      TensorShape output_tensor_shape = TensorShape{
-          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
-
       GenericTensorAccessorW label_tensor_backing =
           allocator.allocate_tensor(output_tensor_shape);
 
-      // construct computation graph
-      ParallelComputationGraph pcg = empty_parallel_computation_graph();
-
-      TensorShape input_tensor_shape = TensorShape{
-          TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
-
-      TensorShape label_tensor_shape = TensorShape{
-          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
-      GenericTensorAccessorW label_tensor =
-          allocator.allocate_tensor(label_tensor_shape);
-
-      TensorShape weight_shape_1 = TensorShape{
-          TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
-      TensorShape weight_shape_2 = TensorShape{
-          TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
-
-      ParallelLayerAddedResult inputs_layer =
-          pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
-      parallel_tensor_guid_t t_input =
-          require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
-
-      ParallelLayerAddedResult weights_layer_1 = add_parallel_layer(
-          pcg,
-          ParallelLayerAttrs{
-              PCGOperatorAttrs{WeightAttrs{
-                  weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
-              std::nullopt},
-          {},
-          {});
-      parallel_tensor_guid_t t_weights_1 =
-          require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT);
-
-      ParallelLayerAddedResult weights_layer_2 = add_parallel_layer(
-          pcg,
-          ParallelLayerAttrs{
-              PCGOperatorAttrs{WeightAttrs{
-                  weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
-              std::nullopt},
-          {},
-          {});
-      parallel_tensor_guid_t t_weights_2 =
-          require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT);
-
-      ParallelLayerAddedResult linear_operator_1 = add_parallel_layer(
-          pcg,
-          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim,
-                                                          /*use_bias=*/false,
-                                                          DataType::FLOAT,
-                                                          Activation::RELU,
-                                                          std::nullopt}},
-                             std::nullopt},
-          {
-              {
-                  TensorSlotName::INPUT,
-                  t_input,
-              },
-          },
-          {
-              {
-                  TensorSlotName::WEIGHT,
-                  t_weights_1,
-              },
-          });
-      parallel_tensor_guid_t t_linear_1 =
-          require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT);
-
-      ParallelLayerAddedResult linear_operator_2 = add_parallel_layer(
-          pcg,
-          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim,
-                                                          /*use_bias=*/false,
-                                                          DataType::FLOAT,
-                                                          Activation::RELU,
-                                                          std::nullopt}},
-                             std::nullopt},
-          {
-              {
-                  TensorSlotName::INPUT,
-                  t_linear_1,
-              },
-          },
-          {
-              {
-                  TensorSlotName::WEIGHT,
-                  t_weights_2,
-              },
-          });
-      parallel_tensor_guid_t t_linear_2 =
-          require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
-
-      MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU};
-      ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
-      MappedParallelComputationGraph mpcg{
-          pcg,
-          {
-              {inputs_layer.parallel_layer,
-               MappedOperatorTaskGroup{
-                   {{gpu0,
-                     OperatorAtomicTaskShardBinding{
-                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
-              {weights_layer_1.parallel_layer,
-               MappedOperatorTaskGroup{
-                   {{gpu0,
-                     OperatorAtomicTaskShardBinding{
-                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
-              {weights_layer_2.parallel_layer,
-               MappedOperatorTaskGroup{
-                   {{gpu0,
-                     OperatorAtomicTaskShardBinding{
-                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
-              {linear_operator_1.parallel_layer,
-               MappedOperatorTaskGroup{
-                   {{gpu0,
-                     OperatorAtomicTaskShardBinding{{
-                         {TensorSlotName::INPUT, tensor_coord0},
-                         {TensorSlotName::WEIGHT, tensor_coord0},
-                         {TensorSlotName::OUTPUT, tensor_coord0},
-                     }}}}}},
-              {linear_operator_2.parallel_layer,
-               MappedOperatorTaskGroup{
-                   {{gpu0,
-                     OperatorAtomicTaskShardBinding{{
-                         {TensorSlotName::INPUT, tensor_coord0},
-                         {TensorSlotName::WEIGHT, tensor_coord0},
-                         {TensorSlotName::OUTPUT, tensor_coord0},
-                     }}}}}},
-          },
-      };
-      MappedOperatorTaskGroup loss_mapping{
-          {{gpu0,
-            OperatorAtomicTaskShardBinding{{
-                {TensorSlotName::INPUT, tensor_coord0},
-                {TensorSlotName::LOGIT, tensor_coord0},
-            }}}}};
-
-      // instantiate computation graph
-      LossAttrs loss_attrs = LossAttrs{
-          NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
-      OptimizerAttrs optimizer_attrs =
-          OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
-                                           /*momentum=*/0.9,
-                                           /*nesterov=*/false,
-                                           /*weight_decay=*/0.001}};
-
       std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
           input_tensors;
 
@@ -481,6 +483,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           check_kv("last_epoch_loss",
                    format_accessor_r_contents(last_epoch_loss)));
     });
+//! [realm-execution example]
   }
 }
 
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/index.dox b/lib/task-spec/include/task-spec/dynamic_graph/index.dox
new file mode 100644
index 0000000000..2930d916cf
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/index.dox
@@ -0,0 +1,17 @@
+namespace FlexFlow {
+/**
+
+\page task-spec-dynamic-graph task-spec/dynamic_graph/
+
+\brief Contains common code for inferring and making explicit information from a \ref MappedParallelComputationGraph, lowering it into a \ref DynamicOpenDataflowGraph that can be executed by \ref realm-execution and/or \ref local-execution.
+
+\section task-spec-lowering-passes Lowering Passes
+
+- \ref pass_expansion.h
+- \ref shard_expansion.h
+- \ref update_insertion.h
+- \ref loss_insertion.h
+- \ref machine_slicing.h
+
+*/
+}
diff --git a/lib/task-spec/include/task-spec/ops/index.dox b/lib/task-spec/include/task-spec/ops/index.dox
new file mode 100644
index 0000000000..bfbc5a60a9
--- /dev/null
+++ b/lib/task-spec/include/task-spec/ops/index.dox
@@ -0,0 +1,9 @@
+namespace FlexFlow {
+/**
+
+\page task-spec-ops task-spec/ops/
+
+\brief Contains the runtime-generic operator implementations, i.e., the adapter code between the runtimes and \ref kernels.
+
+*/
+}
diff --git a/lib/task-spec/include/task-spec/task_argument_accessor/index.dox b/lib/task-spec/include/task-spec/task_argument_accessor/index.dox
new file mode 100644
index 0000000000..3f0f3acd32
--- /dev/null
+++ b/lib/task-spec/include/task-spec/task_argument_accessor/index.dox
@@ -0,0 +1,82 @@
+namespace FlexFlow {
+/**
+
+\page task-argument-accessor TaskArgumentAccessor Interface
+
+\brief TaskArgumentAccessor provides an interface for operator implementations to access arguments while hiding the details of the underlying execution engine (i.e., \ref local-execution or \ref realm-execution).
+
+\section task-argument-accessor-background Background and Motivation
+
+\ref TaskArgumentAccessor was originally designed when %FlexFlow was using %Legion, which required
+tasks to have the following signature:
+
+\code
+void example_task(Legion::Task const *task,
+                  std::vector<Legion::PhysicalRegion> const &regions,
+                  Legion::Context ctx,
+                  Legion::Runtime *runtime);
+\endcode
+
+The task implementation would then proceed to access the necessary arguments/context through these four parameters.
+However, this made the code difficult to test, as creating/mocking these input objects was difficult and often even nonsensical in, for example, non-distributed settings.
+\ref TaskArgumentAccessor was designed to provide an intermediate layer, such that you could transform the above code into
+
+\code
+struct LegionArgumentAccessor : public ITaskArgumentAccessor { ... };
+
+void example_task(Legion::Task const *task,
+                  std::vector<Legion::PhysicalRegion> const &regions,
+                  Legion::Context ctx,
+                  Legion::Runtime *runtime)
+{
+  TaskArgumentAccessor accessor
+    = TaskArgumentAccessor::create<LegionArgumentAccessor>(
+        task, regions, ctx, runtime);
+
+  return example_task_impl(accessor);
+}
+
+void example_task_impl(TaskArgumentAccessor const &accessor);
+\endcode
+
+That way, if we wanted to also call this in a non-distributed context, rather than having to create or mock the %Legion arguments, we can just add an additional implementation of ITaskArgumentAccessor that just accesses the arguments locally, and then we can also execute the task in a local context, all while leaving the actual task implementation unchanged.
+
+\code
+struct LegionArgumentAccessor : public ITaskArgumentAccessor { ... };
+
+void example_task(Legion::Task const *task,
+                  std::vector<Legion::PhysicalRegion> const &regions,
+                  Legion::Context ctx,
+                  Legion::Runtime *runtime)
+{
+  TaskArgumentAccessor accessor
+    = TaskArgumentAccessor::create<LegionArgumentAccessor>(
+        task, regions, ctx, runtime);
+
+  return example_task_impl(accessor);
+}
+
+struct LocalArgumentAccessor : public ITaskArgumentAccessor { ... };
+
+void example_task(MyLocalArgs const &my_local_args)
+{
+  TaskArgumentAccessor accessor
+    = TaskArgumentAccessor::create<LocalArgumentAccessor>(my_local_args);
+
+  return example_task_impl(accessor);
+}
+
+void example_task_impl(TaskArgumentAccessor const &accessor);
+\endcode
+
+\section task-argument-accessor-current-design Current Design
+
+TaskArgumentAccessor is just a thin, ref-counted wrapper around the abstract ITaskArgumentAccessor interface.
+Instances of ITaskArgumentAccessor provide access to the following arguments:
+
+- One of \ref PCGOperatorAttrs, \ref LossAttrs, or \ref OptimizerAttrs depending on whether this task is for an operator, an optimizer, or a loss function.
+- Two pieces of device-specific state: \ref device_handle_t (aka \ref term-ff-handle FF handle) and \ref term-per-device-op-state PerDeviceOpState. As both of these contain pointers and hold device-specific initialization state, in distributed execution their addresses (rather than their contents) are passed around, and they are only valid on the device they originated on. One \ref term-ff-handle should be created per device, while one \ref term-per-device-op-state should be created for every operator for every device it runs on.
+- A few simple value types communicating runtime-wide settings: \ref ProfilingSettings, \ref DeviceType, and \ref FFIterationConfig.
+
+*/
+}
diff --git a/lib/task-spec/index.dox b/lib/task-spec/index.dox
index 7e71232cd3..44d9b0a2cb 100644
--- a/lib/task-spec/index.dox
+++ b/lib/task-spec/index.dox
@@ -1,7 +1,14 @@
+namespace FlexFlow {
 /**
 
-@page task-spec task-spec
+\page task-spec task-spec/
 
 \brief An intermediate layer between the compiler and the runtime. Contains code for lowering the \ref MappedParallelComputationGraph exported from the \ref compiler down to a granularity that the runtime can actually execute. Also contains the functions that translate between logical operators (i.e., \ref op-attrs-ops) and actual calls to \ref kernels.
 
+Primary components:
+- \subpage task-spec-ops "": \copybrief task-spec-ops
+- \subpage task-spec-dynamic-graph "": \copybrief task-spec-dynamic-graph
+- \subpage task-argument-accessor "": \copybrief task-argument-accessor
+
 */
+}
diff --git a/lib/utils/include/utils/containers/index.dox b/lib/utils/include/utils/containers/index.dox
index 38d050d67a..9b3865dd78 100644
--- a/lib/utils/include/utils/containers/index.dox
+++ b/lib/utils/include/utils/containers/index.dox
@@ -8,5 +8,11 @@ Some of the most commonly-used functions are listed below, but you should ideall
 
 - \ref containers/transform.h
 - \ref containers/filter.h
+- \ref containers/contains.h
+- \ref containers/generate_map.h
+- \ref containers/get_only.h
+- \ref containers/slice.h
+- \ref containers/merge_disjoint_maps.h
+- \ref containers/is_subseteq_of.h
 
 */
diff --git a/lib/utils/include/utils/orthotope/index.dox b/lib/utils/include/utils/orthotope/index.dox
new file mode 100644
index 0000000000..8dace16bc1
--- /dev/null
+++ b/lib/utils/include/utils/orthotope/index.dox
@@ -0,0 +1,7 @@
+namespace FlexFlow {
+/**
+
+\page utils-orthotope Orthotope and Friends
+
+*/
+}
diff --git a/lib/utils/index.dox b/lib/utils/index.dox
index 07be0c58ce..e374cb5a43 100644
--- a/lib/utils/index.dox
+++ b/lib/utils/index.dox
@@ -1,11 +1,37 @@
+namespace FlexFlow {
 /**
 
-@page utils utils
+\page utils utils
 
-@brief Various utility and support libraries for the rest of the project. Particularly useful are @ref "utils-graph", @ref "utils-containers", and @ref "utils-cli".
+\brief Various utility and support libraries for the rest of the project. Particularly useful are \ref "utils-graph", \ref "utils-containers", and \ref "utils-cli".
 
+Major components:
 - \subpage utils-containers
 - \subpage utils-cli
 - \subpage utils-graph
+- \subpage utils-restricted-int-types
+- \subpage utils-restricted-map-types
+- \subpage utils-orthotope
 
 */
+
+/**
+
+\page utils-restricted-int-types Restricted Integer Types
+
+- \ref nonnegative_int
+- \ref positive_int
+- \ref int_ge_two
+
+*/
+
+/**
+
+\page utils-restricted-map-types Restricted Map Types
+
+- \ref bidict
+- \ref OneToMany
+- \ref ManyToOne
+
+*/
+}

From 56b48d1201ad9e6b0cd62d799a35cc63b777e3ad Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Thu, 5 Mar 2026 02:05:24 -0800
Subject: [PATCH 092/113] Convert the root md files

---
 bin/index.dox                                 |   2 +-
 contributing.dox                              | 216 ++++++++++++++
 docs/doxygen/Doxyfile                         |   7 +-
 docs/{SAPLING.md => sapling.dox}              |  10 +-
 index.dox                                     |  38 ++-
 lib/compiler/index.dox                        |   2 +-
 lib/index.dox                                 |   2 +-
 lib/kernels/index.dox                         |   2 +-
 lib/op-attrs/include/op-attrs/ops/index.dox   |   2 +-
 lib/op-attrs/index.dox                        |   2 +-
 lib/pcg/include/pcg/file_format/v1/index.dox  |   2 +-
 lib/pcg/index.dox                             |  11 +-
 .../tasks/impl/controller_task.h              |   2 +-
 .../realm-execution/tasks/impl/op_task.h      |  18 +-
 .../include/task-spec/dynamic_graph/index.dox |   2 +-
 lib/task-spec/include/task-spec/ops/index.dox |   2 +-
 lib/task-spec/index.dox                       |   2 +-
 lib/utils/include/utils/graph/README.md       | 277 ------------------
 lib/utils/include/utils/graph/index.dox       | 201 ++++++++++++-
 19 files changed, 487 insertions(+), 313 deletions(-)
 create mode 100644 contributing.dox
 rename docs/{SAPLING.md => sapling.dox} (95%)
 delete mode 100644 lib/utils/include/utils/graph/README.md

diff --git a/bin/index.dox b/bin/index.dox
index 04ee338fb6..5e688a54d6 100644
--- a/bin/index.dox
+++ b/bin/index.dox
@@ -1,6 +1,6 @@
 /**
 
-\page bin bin/
+\page bin bin
 
 This directory contains command-line interfaces for %FlexFlow %Train and associated tools (all in C++).
 
diff --git a/contributing.dox b/contributing.dox
new file mode 100644
index 0000000000..ae643097bb
--- /dev/null
+++ b/contributing.dox
@@ -0,0 +1,216 @@
+/**
+
+\page contributing Developers Guide
+
+\section contributing-setup Setup
+
+\note If you are developing on Stanford's sapling cluster, instead see the instructions \ref sapling-setup "here". If you don't know what this means, you're not using sapling so you should just continue reading.
+
+1. %FlexFlow %Train uses <a href="https://nix.dev/manual/nix/2.24/">nix</a> to manage dependencies and the development environment.
+   There exist a number of ways to install nix, but we recommend one of the following:
+
+   1. If you have root permissions: [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)
+
+   2. If you don't have root permissions: [DavHau/nix-portable](https://github.com/DavHau/nix-portable).
+      Note that nix-portable does not work particularly well if the nix store is in NFS \ref contributing-footnote-1 "[1]" or other distributed file systems,
+      so if you are running on an HPC cluster where the home directory is mounted via a distributed file system we recommend setting the
+      <tt>NP_LOCATION</tt> environment variable to <tt>/tmp</tt> or some other non-NFS location.
+
+      While you should at least skim nix-portable's setup instructions, you'll probably end up doing something like this:
+
+      \verbatim
+      $ USERBIN="${XDG_BIN_HOME:-$HOME/.local/bin}"
+      $ wget 'https://github.com/DavHau/nix-portable/releases/download/v010/nix-portable' -O "$USERBIN/nix-portable"
+      ...
+      $ chmod u+x "$USERBIN/nix-portable"
+      ...
+      $ ln -sf "$USERBIN/nix-portable" "$USERBIN/nix"
+      ...
+      $ echo "export PATH=\"$USERBIN:\$PATH\"" >> ~/.bashrc
+      ...
+      \endverbatim
+
+      Now if everything is set up properly, you should be able to see something like the following (don't worry if the version number is slightly different) if you run <tt>nix \--version</tt>:
+
+      \verbatim
+      $ nix --version
+      nix (Nix) 2.20.6
+      \endverbatim
+
+2. Clone the %FlexFlow %Train repository (or, if you'd prefer, follow the alternative setup instructions in the [ff-dev](#ff-dev-optional) section)
+
+\verbatim
+$ FF_DIR="$HOME/flexflow-train" # or wherever else you want to put the repository
+$ git clone --recursive git@github.com:flexflow/flexflow-train.git "$FF_DIR"
+...
+\endverbatim
+
+3. Enter the nix-provided `default` development environment \ref contributing-footnote-2 "[2]"
+
+\verbatim
+$ cd "$FF_DIR"
+$ nix develop --accept-flake-config
+\endverbatim
+
+4. Build and run the non-GPU-required tests (systems that have access to CUDA GPUs can also run the GPU-mandatory tests by following the instructions \ref contributing-gpu-setup "here")
+
+\verbatim
+(ff) $ proj cmake
+...
+(ff) $ proj test --skip-gpu-tests
+...
+\endverbatim
+
+If everything is correctly configured, you should see a bunch of build messages followed by something like
+
+\verbatim
+(ff) $ proj test --skip-gpu-tests
+421/421 Test #441: get_transformer_computation_graph
+100% tests passed, 0 tests failed out of 421
+
+Label Time Summary:
+compiler-tests                  =   6.13 sec*proc (19 tests)
+local-execution-tests           =   0.13 sec*proc (3 tests)
+models-tests                    =   0.05 sec*proc (4 tests)
+op-attrs-tests                  =   0.48 sec*proc (59 tests)
+pcg-tests                       =   0.33 sec*proc (33 tests)
+substitution-generator-tests    =   0.06 sec*proc (2 tests)
+substitutions-tests             =   0.10 sec*proc (9 tests)
+utils-tests                     =   1.20 sec*proc (293 tests)
+
+Total Test time (real) =   8.64 sec
+\endverbatim
+
+If you don't, or if you see any tests failing, please double check that you have followed the instructions above.
+If you have and are still encountering an issue, please \ref contributing-contact-us "contact us" with a detailed description of your platform and the commands you have run.
+
+\subsection contributing-editorconfig EditorConfig
+
+%FlexFlow %Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors.
+The EditorConfig file for %FlexFlow %Train can be found in [`.editorconfig`](./.editorconfig).
+If you are using vim, emacs, or another editor with built-in EditorConfig support (a full list of editors with built-in EditorConfig support can be found [here](https://editorconfig.org/#pre-installed))
+the configuration will be detected and applied without you needing to do anything.
+If you are using an editor not on this list, you will need to install a corresponding [EditorConfig plugin](https://editorconfig.org/#editor-plugins).
+<b>If you are using vscode, you should install [this plugin](https://marketplace.visualstudio.com/items?itemName=EditorConfig.EditorConfig).</b>
+
+\subsection contributing-gpu-setup GPU Setup
+
+If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell:
+
+\verbatim
+$ NIXPKGS_ALLOW_UNFREE=1 nix develop .#gpu --accept-flake-config --impure
+\endverbatim
+
+and then running
+
+\verbatim
+(ff) $ proj test
+...
+\endverbatim
+
+You should see the additional GPU tests run. If you instead see a message like
+
+> `Error: ... Pass --skip-gpu-tests to skip running tests that require a GPU`
+
+Double check that you are correctly in the `gpu` devshell, not the `default` devshell.
+If you've confirmed that you are in the correct devshell and are still encountering issues, \ref contributing-contact-us "contact us"
+with a detailed description of your platform and the commands you have run.
+
+\subsection contributing-nix-direnv nix-direnv (optional)
+
+If you installed nix system-wide (e.g., using [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)),
+you can use [direnv](https://direnv.net/) to automatically enter the %FlexFlow %Train development environment when you `cd` into the repository, rather
+than having to manually run `nix develop`.
+[direnv](https://direnv.net) will also automatically exit the environment when you `cd` out of the repository, and (if configured using [nix-direnv](https://github.com/nix-community/nix-direnv)) will even automatically reload the environment if the `flake.nix` file changes.
+You can find the installation instructions for direnv [here](https://direnv.net/docs/installation.html), and if you would like automatic environment reloading you can also install nix-direnv using the instructions [here](https://github.com/nix-community/nix-direnv?tab=readme-ov-file#installation).
+
+Once you have direnv (and optionally nix-direnv) installed, cd into the root of your cloned %FlexFlow %Train repository and run
+
+\verbatim
+$ echo 'use flake . --accept-flake-config' > .envrc
+\endverbatim
+
+You should see a message that the `.envrc` file you just created is blocked.
+Run the command shown in the error message (i.e., `direnv allow`), and direnv should automatically place you in the environment.
+For more information on using direnv with nix, see [here](https://github.com/direnv/direnv/wiki/Nix).
+
+\section contributing-proj Building, Testing, etc.
+
+Most operations you'll want to perform while developing %FlexFlow %Train are provided through a small python utility called [proj](https://github.com/lockshaw/proj).
+`proj` is automatically pulled in by nix when you enter the dev shell, so you should be able to run
+
+\verbatim
+(ff) $ proj -h
+\endverbatim
+
+and see the full list of operations that `proj` supports.
+`proj` commands can be run from anywhere in the repository (i.e., they do not have to be run from the root).
+To help you get started, however, a list of common command invocations is included here:
+
+- To build %FlexFlow %Train:
+  \verbatim
+  (ff) $ proj build
+  \endverbatim
+- To build and run %FlexFlow %Train tests (without a GPU):
+  \verbatim
+  (ff) $ proj test --skip-gpu-tests
+  \endverbatim
+- To build and run %FlexFlow %Train tests (with a GPU):
+  \verbatim
+  (ff) $ proj test
+  \endverbatim
+- To regenerate CMake files (necessary anytime you switch branches or modify the CMake source. If you're ever running into weird build issues, try running this and see if it fixes things):
+  \verbatim
+  (ff) $ proj cmake
+  \endverbatim
+- To format all of the %FlexFlow %Train source files:
+  \verbatim
+  (ff) $ proj format
+  \endverbatim
+- To build the %FlexFlow %Train docs:
+  \verbatim
+  (ff) $ proj doxygen
+  \endverbatim
+  You can also add the `--browser` flag to automatically open the built docs in your default browser if you are working on your local machine.
+
+\section contributing-ci Continuous Integration
+
+We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows:
+
+1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see \ref contributing-proj "here" for more information on `proj`).
+2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.
+
+GPU machines for CI are managed using [runs-on](https://runs-on.com/).
+
+\section contributing-contributing Contributing to FlexFlow
+
+We actively welcome your pull requests. Note that we may already be working on the feature/fix you're looking for, so we suggest searching through the [open issues](https://github.com/flexflow/flexflow-train/issues), [open PRs](https://github.com/flexflow/flexflow-train/pulls), and \ref contributing-contact-us "contacting us" to make sure you're not duplicating existing effort!
+
+The steps for getting changes merged into %FlexFlow are relatively standard:
+
+1. [Fork the repo](https://github.com/flexflow/flexflow-train/fork) and either create a new branch based on `master`, or just modify `master` directly.
+2. If you've added code that should be tested, add tests. The process for adding tests for code under `lib` is documented [here](./lib/README.md#tests). Adding tests for other parts of the code is currently undocumented, so you will need to \ref contributing-contact-us "contact us" for information on how to do it.
+3. Ensure the code builds (i.e., run `proj build`).
+4. Ensure the test suite passes (i.e., run `proj test`).
+5. Format the code (i.e., run `proj format`).
+6. Create a new PR from your modified branch to the `master` branch in %FlexFlow %Train.
+   Provide a brief description of the changes you've made and link any related/closed issues.
+
+Code review is done using [Reviewable](https://reviewable.io/).
+If you haven't used Reviewable before, please read through (or at least skim) the ["Reviews" section](https://docs.reviewable.io/reviews.html) of the Reviewable documentation.
+
+\section contributing-contact-us Contact Us
+
+Either [create an issue](https://github.com/flexflow/flexflow-train/issues/new) or join the %FlexFlow [Zulip](https://flexflow.zulipchat.com/join/mtiwtwttgggnivrkb6vlakbr/) instance.
+For any reported bugs, please ensure that your description is clear and has sufficient information for us to reproduce the issue.
+
+\section contributing-license License
+
+By contributing to %FlexFlow %Train, you agree that your contributions will be licensed
+under the [LICENSE](./LICENSE) file in the root directory of this source tree.
+
+<hr/>
+
+1. \anchor contributing-footnote-1 <a href="https://en.wikipedia.org/wiki/Network_File_System">Network File System</a>
+2. \anchor contributing-footnote-2 aka "dev shell"
+*/
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 3b20f35946..ef0b91d4d8 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -42,7 +42,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = FlexFlow
+PROJECT_NAME           = FlexFlow Train
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -944,7 +944,10 @@ WARN_LOGFILE           =
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = $(FF_HOME)/lib \
-                         $(FF_HOME)/bin
+                         $(FF_HOME)/bin \
+                         $(FF_HOME)/index.dox \
+                         $(FF_HOME)/contributing.dox \
+                         $(FF_HOME)/docs/sapling.dox
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/docs/SAPLING.md b/docs/sapling.dox
similarity index 95%
rename from docs/SAPLING.md
rename to docs/sapling.dox
index ad36c3e9cb..143e09c6f7 100644
--- a/docs/SAPLING.md
+++ b/docs/sapling.dox
@@ -1,4 +1,6 @@
-# Setup Guide for sapling
+/**
+
+@page sapling-setup Setup Guide for sapling
 
 1. ssh into the sapling head node.
 
@@ -82,8 +84,10 @@ NIXPKGS_ALLOW_UNFREE=1 nix develop .#gpu --accept-flake-config --impure
 (ff) $ proj test
 ...
 ```
-You should see the additional GPU tests run. If you instead see a message like 
+You should see the additional GPU tests run. If you instead see a message like
 
 > `Error: ... Pass --skip-gpu-tests to skip running tests that require a GPU`
 
-Double check that you are correctly in the `gpu` devshell, not the `default` devshell. 
+Double check that you are correctly in the `gpu` devshell, not the `default` devshell.
+
+*/
diff --git a/index.dox b/index.dox
index 73318d5315..a97476dbf3 100644
--- a/index.dox
+++ b/index.dox
@@ -1,8 +1,40 @@
 /**
 
-\mainpage FlexFlow
+\mainpage %FlexFlow %Train
 
-- \subpage bin
-- \subpage lib
+\brief %FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies.
+
+\section root-layout Project Layout
+
+The bulk of the %FlexFlow source code is stored in the following folders:
+
+- \subpage lib "": The C++ code that makes up %FlexFlow's core, split up into a number of libraries. You can find a description of each library [here](./lib/README.md).
+- \subpage bin "": Command-line interfaces for %FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in \ref lib for the actual processing/logic. You can find a description of each binary \ref bin "here".
+- `bindings`: Python (or any additional languages added in the future) bindings for %FlexFlow %Train
+- `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files as either `.md` files or inline in the language's documentation syntax (i.e., <a href="https://www.doxygen.nl/manual/index.html">Doxygen</a> for C++ and <a href="https://www.sphinx-doc.org/en/master/">Sphinx</a> for Python).
+- `cmake`: CMake configuration for building %FlexFlow %Train. Note that unless you're modifying the build configuration (i.e., adding a library, additional dependencies, etc.), you generally should use \ref contributing-proj "proj" instead of interacting with CMake directly.
+
+\section root-contributing Contributing
+
+Please let us know if you encounter any bugs or have any suggestions by <a href="https://github.com/flexflow/flexflow-train/issues">submitting an issue</a>.
+
+For instructions on how to contribute code to %FlexFlow Train, see \subpage contributing.
+
+We welcome all contributions to %FlexFlow %Train from bug fixes to new features and extensions.
+
+\section root-citations Citations
+
+- Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. [Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization](https://www.usenix.org/conference/osdi22/presentation/unger). In Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI), July 2022.
+
+- Zhihao Jia, Matei Zaharia, and Alex Aiken. [Beyond Data and Model Parallelism for Deep Neural Networks](https://cs.stanford.edu/~zhihao/papers/sysml19a.pdf). In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys), April 2019.
+
+- Zhihao Jia, Sina Lin, Charles R. Qi, and Alex Aiken. [Exploring Hidden Dimensions in Parallelizing Convolutional Neural Networks](http://proceedings.mlr.press/v80/jia18a/jia18a.pdf). In Proceedings of the International Conference on Machine Learning (ICML), July 2018.
+
+\section root-team The Team
+
+%FlexFlow %Train is developed and maintained by teams at CMU, Facebook, Los Alamos National Lab, MIT, Stanford, and UCSD (alphabetically).
+
+\section root-license License
+%FlexFlow %Train uses Apache License 2.0.
 
 */
diff --git a/lib/compiler/index.dox b/lib/compiler/index.dox
index 29feab4dcc..236b42a76c 100644
--- a/lib/compiler/index.dox
+++ b/lib/compiler/index.dox
@@ -1,6 +1,6 @@
 /**
 
-\page compiler compiler/
+\page compiler compiler
 
 \brief Implements the core logic of the compiler.
 
diff --git a/lib/index.dox b/lib/index.dox
index 6925d77249..8f7f8d5586 100644
--- a/lib/index.dox
+++ b/lib/index.dox
@@ -1,7 +1,7 @@
 namespace FlexFlow {
 /**
 
-\page lib lib/
+\page lib lib
 
 This directory contains the core C++ code that underlies %FlexFlow, organized into the following libraries:
 
diff --git a/lib/kernels/index.dox b/lib/kernels/index.dox
index 6465e11307..085fa513e7 100644
--- a/lib/kernels/index.dox
+++ b/lib/kernels/index.dox
@@ -1,6 +1,6 @@
 /**
 
-\page kernels kernels/
+\page kernels kernels
 
 \brief %CPU and %GPU implementations of the operators, for use in the runtime and in operator profiling.
 
diff --git a/lib/op-attrs/include/op-attrs/ops/index.dox b/lib/op-attrs/include/op-attrs/ops/index.dox
index e8c9f3b31e..669e9aa027 100644
--- a/lib/op-attrs/include/op-attrs/ops/index.dox
+++ b/lib/op-attrs/include/op-attrs/ops/index.dox
@@ -1,7 +1,7 @@
 namespace FlexFlow {
 /**
 
-\page op-attrs-ops op-attrs/ops/
+\page op-attrs-ops op-attrs/ops
 
 \brief Contains the compiler-side definitions of the operators.
 
diff --git a/lib/op-attrs/index.dox b/lib/op-attrs/index.dox
index 677c01ef40..51bfd31db3 100644
--- a/lib/op-attrs/index.dox
+++ b/lib/op-attrs/index.dox
@@ -1,6 +1,6 @@
 /**
 
-\page op-attrs op-attrs/
+\page op-attrs op-attrs
 
 \brief Contains the compiler-side definition of all of the operators and associated functions for reasoning about their behavior, as well as the fundamental concepts needed to represent them.
 
diff --git a/lib/pcg/include/pcg/file_format/v1/index.dox b/lib/pcg/include/pcg/file_format/v1/index.dox
index fba7f69017..e6d0d4be4f 100644
--- a/lib/pcg/include/pcg/file_format/v1/index.dox
+++ b/lib/pcg/include/pcg/file_format/v1/index.dox
@@ -1,5 +1,5 @@
 /**
 
-@page file-format pcg/file_format/v1/
+@page file-format pcg/file_format/v1
 
 */
diff --git a/lib/pcg/index.dox b/lib/pcg/index.dox
index 55b478edf0..22e5e23903 100644
--- a/lib/pcg/index.dox
+++ b/lib/pcg/index.dox
@@ -1,15 +1,15 @@
 namespace FlexFlow{
 /**
 
-\page pcg pcg/
+\page pcg pcg
 
 @brief Defines the top-level datastructures (ComputationGraph, ParallelComputationGraph, and MappedParallelComputationGraph) and their serialization formats, along with some helper interfaces for constructing and manipulating them.
 
 \section pcg-datastructures Key Datastructures
 
 - \ref ComputationGraph "": aka CG
-- \ref FlexFlow::ParallelComputationGraph "ParallelComputationGraph": aka PCG
-- \ref FlexFlow::MappedParallelComputationGraph "MappedParallelComputationGraph": aka MPCG
+- \ref ParallelComputationGraph "": aka PCG
+- \ref MappedParallelComputationGraph "": aka MPCG
 
 \section serialization-formats Serialization
 
@@ -17,9 +17,8 @@ namespace FlexFlow{
 
 \section pcg-helpers Helper Functionality
 
-- \ref FlexFlow::ComputationGraphBuilder "ComputationGraphBuilder"
-- \ref FlexFlow::ParallelComputationGraphBuilder "ParallelComputationGraphBuilder"
-
+- \ref ComputationGraphBuilder ""
+- \ref ParallelComputationGraphBuilder ""
 
 */
 }
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
index 07a324f973..7b919edda7 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -14,7 +14,7 @@ void controller_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Dispatches the \ref controller task. Packages up the provided \ref std::function and
+ * \brief Dispatches the \ref term-controller task. Packages up the provided \ref std::function and
  * passes it along to \ref controller_task_body.
  */
 Realm::Event
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index e413f6da31..7853444bfe 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -23,22 +23,22 @@ namespace FlexFlow {
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * @brief Launches the task for a DynamicNodeInvocation using realm.
+ * \brief Launches the task for a DynamicNodeInvocation using realm.
  *
  * The task launch process functions a bit differently to that used in the
  * previous FlexFlow codebase. Rather than having a function registered with
  * realm/legion for every task_id_t, we now have only a few functions
- * registered: @ref op_task_body, @ref device_handle_init_task_body,
- * @ref device_state_init_return_task_body, and @ref controller_task_body (see
- * @ref register_all_tasks for where this list comes from), and in fact only
- * @ref op_task_body is launched by @ref spawn_op_task. Each of these registered
+ * registered: \ref op_task_body, \ref device_handle_init_task_body,
+ * \ref device_state_init_return_task_body, and \ref controller_task_body (see
+ * \ref register_all_tasks for where this list comes from), and in fact only
+ * \ref op_task_body is launched by \ref spawn_op_task. Each of these registered
  * tasks use the serialized arguments sent to them to dispatch to the correct
  * implementatin in task-spec: for example, if we are trying to launch the task
- * for a Conv2d operator, this function will actually dispatch a call to @ref
- * op_task_body with a serialized OpTaskArgs as an argument, and then @ref
+ * for a Conv2d operator, this function will actually dispatch a call to \ref
+ * op_task_body with a serialized OpTaskArgs as an argument, and then \ref
  * op_task_body will deserialize the argument, determine that we are trying to
- * launch the forward pass of Conv2d, use @ref execute_dynamic_node_invocation
- * (which then uses @ref call_fwd_task_impl) to actually call the function in
+ * launch the forward pass of Conv2d, use \ref execute_dynamic_node_invocation
+ * (which then uses \ref call_fwd_task_impl) to actually call the function in
  * lib/task-spec/src/task-spec/ops/impl/conv_2d.cc
  *
  * The above also means that we don't have a separate
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/index.dox b/lib/task-spec/include/task-spec/dynamic_graph/index.dox
index 2930d916cf..e3259b5632 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/index.dox
+++ b/lib/task-spec/include/task-spec/dynamic_graph/index.dox
@@ -1,7 +1,7 @@
 namespace FlexFlow {
 /**
 
-\page task-spec-dynamic-graph task-spec/dynamic_graph/
+\page task-spec-dynamic-graph task-spec/dynamic_graph
 
 \brief Contains common code for inferring and making explicit information from a \ref MappedParallelComputationGraph, lowering it into a \ref DynamicOpenDataflowGraph that can be executed by \ref realm-execution and/or \ref local-execution.
 
diff --git a/lib/task-spec/include/task-spec/ops/index.dox b/lib/task-spec/include/task-spec/ops/index.dox
index bfbc5a60a9..6910063ecd 100644
--- a/lib/task-spec/include/task-spec/ops/index.dox
+++ b/lib/task-spec/include/task-spec/ops/index.dox
@@ -1,7 +1,7 @@
 namespace FlexFlow {
 /**
 
-\page task-spec-ops task-spec/ops/
+\page task-spec-ops task-spec/ops
 
 \brief Contains the runtime-generic operator implementations, i.e., the adapter code between the runtimes and \ref kernels.
 
diff --git a/lib/task-spec/index.dox b/lib/task-spec/index.dox
index 44d9b0a2cb..187bbe9638 100644
--- a/lib/task-spec/index.dox
+++ b/lib/task-spec/index.dox
@@ -1,7 +1,7 @@
 namespace FlexFlow {
 /**
 
-\page task-spec task-spec/
+\page task-spec task-spec
 
 \brief An intermediate layer between the compiler and the runtime. Contains code for lowering the \ref MappedParallelComputationGraph exported from the \ref compiler down to a granularity that the runtime can actually execute. Also contains the functions that translate between logical operators (i.e., \ref op-attrs-ops) and actual calls to \ref kernels.
 
diff --git a/lib/utils/include/utils/graph/README.md b/lib/utils/include/utils/graph/README.md
deleted file mode 100644
index 78fdb4b6a1..0000000000
--- a/lib/utils/include/utils/graph/README.md
+++ /dev/null
@@ -1,277 +0,0 @@
-# graph
-
-## Design Considerations
-
-FlexFlow's graph library very intentionally attempts to balance performance and ease of use.
-The graph library aims to have a very simple external interface that is highly decoupled from the underlying representations, so performance and internal implementations can be tuned and modified over time without breaking the code that uses the library.
-Because FlexFlow's graphs are not on the scale of machine memory or not so large that single traversals takes nontrivial time, the graph library intentionally avoids performance opportunities that would expose many of these performance aspects to user code.
-Of course, there are also some optimizations that simply have not been done due to time constraints: for example, algorithms currently are able to be specialized for the underlying representation being used, but this could be added without modifying the user-side interface.
-
-## Usage
-
-### Core Graph Variants
-
-There is no single type of graph. Should it be directed? Allow multiple edges between nodes? Should nodes and/or edges have information attached?
-Because there is no single answer to this question, similar to [networkx](https://networkx.org/) we provide a number of different graph variants.
-At their core, they are as follows:
-
-- `UndirectedGraph`: at most one edge allowed between every pair of nodes, edges are undirected.
-- `DiGraph`: at most one edge allowed between every ordered pair of nodes, edges are directed (i.e., have a source node and a destination node)
-- `MultiDiGraph`: arbitrary numbers of directed edges allowed between every pair of nodes.
-- `DataflowGraph`: used to model computation graphs. See the [DataflowGraph](#dataflowgraph) section for a detailed explanation.
-
-Examples of the different graph variants are shown below.
-
-Example of `UndirectedGraph`:
-```mermaid
-flowchart TD
-    A(" ")
-    B(" ")
-    C(" ")
-    D(" ")
-    E(" ")
-
-    A --- B
-    A --- C
-    B --- C
-    B --- B
-    D --- B
-```
-
-Example of `DiGraph`:
-```mermaid
-flowchart TD
-    A(" ")
-    B(" ")
-    C(" ")
-    D(" ")
-    E(" ")
-    F(" ")
-
-    A --> F
-    B --> E
-    B --> C
-    B --> B
-    D --> B
-    C --> D
-```
-
-Example of `MultiDiGraph`:
-```mermaid
-flowchart TD
-    A
-    B
-    C
-    D
-    E
-    F
-
-    A --> B
-    B --> C
-    C --> D
-    D --> A
-    B --> E
-    E --> B
-    D --> A
-    A --> E
-    D --> D
-    E --> E
-```
-
-Note that the node names are completely arbitrary: they have no apparent ordering or other meaning besides representing the topology of the graph.
-This is the case with all of the 4 core graph classes.
-Nodes are of type `Node`, and from a user perspective are simply opaque handles, and source and destination indices should similarly be considered opaque from a user point of view.
-In addition, nodes should only be used in the context of their graph, so comparing or checking equality of nodes between different graphs (even of the same type) is undefined behavior[^1].
-
-All three core graph variants allow insertion and deletion of both edges and nodes.
-To add a node to an `UndirectedGraph g`, simply call `g.add_node()`, which will return a `Node` object.
-For semantics closer to `networkx`'s method of adding nodes, `g.add_node_unsafe(my_node)` can be used. This is useful when constructing a modified copy of an existing graph (given that it maintains node bijection), though it is not generally recommended.
-The interface for node addition is identical for `DiGraph` and `MultiDiGraph`.
-To add an edge between two nodes `Node n1` and `Node n2` to an `UndirectedGraph g`, call `g.add_edge({n1, n2})`.
-In `UndirectedGraph` the order of the arguments of `add_edge` doesn't matter as edges are undirected, but the order does matter for `DiGraph`, `MultiDiGraph` and `DataflowGraph`.
-
-The last paragraph covered the base API used to write to graphs, but we also want to be able to read from graphs.
-Reading from graphs is implemented with the `query_nodes` and `query_edges` methods, which can be thought of as executing a database query over the nodes and edges of the target graph, respectively (where queries are restricted to an incredibly simple set of operations).
-The argument to `query_nodes` is a `NodeQuery` (which is simply a set of `Node`s).
-`query_nodes` then returns the intersection of the nodes in the graph and the nodes in the query.
-The set of nodes in the query is actually an `optional`, so `nullopt` could also be passed, which would simply retrieve all nodes from the target graph (essentially `nullopt` acts as the set of all nodes that could ever exist).
-`query_edges` functions similarly, but as with `add_edge` its behavior is differs slightly between the three graph variants.
-`UndirectedGraph::query_edges` simply takes an optional set of nodes and returns all edges that touch any of those nodes.
-`DiGraph::query_edges` allows separate sets for source and destination nodes, and `MultiDiGraph::query_edges` adds the ability to filter by source and destination indices as well.
-
-In practice you will rarely ever use `query_nodes` and `query_edges` as the graph library provides a large number of algorithms that do that work for you, but it can be helpful to understand this base layer if you ever need to implement your own algorithms.
-The layer users will most commonly interact with is the interface provided within either the `algorithms.h` header files or the `algorithms` folders, present in their respective graph class folders.
-They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as `get_nodes` to as complex as `get_transitive_reduction` and `get_dominators`.
-Note that, due to the internal virtual inheritance structure, some functions for more privitive classes can be employed by the derived classes. (For example, `get_nodes` present in `node/algorithms.h` can be used by `DiGraph`).
-You may notice that the most of algorithms present take as arguments not `UndirectedGraph`, `DiGraph`, and `MultiDiGraph`, but rather `UndirectedGraphView`, `DiGraphView`, and `MultiDiGraphView`.
-These `GraphView` objects represent read-only (i.e., immutable) graphs.
-Similar to C++'s `const` semantics, `Graph`s can be coerced[^2] to `GraphView`s but not the other way around.
-To transform a `GraphView` to a `Graph`, we can perform an explicit copy with `materialize_view`.
-Both `Graph` and `GraphView` types follow normal value semantics.
-This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the `Graph` and `GraphView` types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a `DiGraph` use for example `get_subgraph` to get a `DiGraphView` representing a part of this graph, modifications to the underlying `DiGraph` will not be mirrored in the `DiGraphView` and the `DiGraphView` will remain valid even after the base `DiGraph` leaves scope.
-
-At this point, however, we still have not discussed how to create a graph.
-The user-facing graph interface is intentionally separated from the underlying graph representations, so representations can be changed without requiring any user-side code modifications besides the choice of which implementation to use.
-For example, to construct a `DiGraph` which internally uses a representation such as `AdjacencyDiGraph` we do the following:
-```cpp
-DiGraph g = DiGraph::create<AdjacencyDiGraph>();
-```
-Generally users will use underlying representations provided by the graph library, but advanced users can create their own implementations (see the [Internals](#internals) section).
-
-[^1]: At some point we will likely add actual runtime checks on this, but for now we rely on the user not to mess up. Currently the implementation will keep going silently until the incorrectness grows so large that something breaks/crashes.
-[^2]: See <https://en.wikipedia.org/wiki/Type_conversion> if you're not familiar with the term _type coercion_
-
-### DataflowGraph
-
-The primary abstraction for representing computation graphs / task graphs is the `DataflowGraph` interface (along with its variants, `OpenDataflowGraph`, `LabelleledDataflowGraph` and `OpenLabelleledDataflowGraph`).
-At a high level, nodes represent multivariate functions (from tuples of inputs to tuple of outputs), while edges represent value uses of such functions.
-
-`DataflowGraph` is similar to `MultiDiGraph`, but with the following important differences:
-  - The edges entering, exiting a given nodes have a well-defined order.
-  - The outputs of a given node also have a well-defined order.
-  - `DataflowGraph`s are directed acyclic graphs. This is enforced by the interface used to construct them, since a node can only be added to the graph after all of its predecessor nodes have already been added.
-
-The main components of `DataflowGraph` are as follows:
-- `DataflowInput`: used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator).
-- `DataflowOutput`: used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator).
-- `DataflowEdge`: wrapper around a `DataflowInput`, `DataflowOutput` pair between 2 nodes.
-- `NodeAddedResult`: returned upon adding a new node. Contains the newly generated `Node` and the vector of `DataflowOutput`s for the given node.
-
-`DataflowGraph`s are constructed as follows:
-
-```cpp
-    auto g = DataflowGraph::create<UnorderedSetDataflowGraph>();
-
-    // Node with no inputs and 2 outputs
-    NodeAddedResult n1_result = g.add_node({}, 2);
-    Node n1 = n1_result.node;
-    DataflowOutput n1_o1 = n1_result.outputs[0];
-    DataflowOutput n1_o2 = n1_result.outputs[1];
-
-    // Node with 2 inputs and 1 output
-    NodeAddedResult n2_result = g.add_node({n1_o1, n1_o2}, 1);
-    Node n2 = n2_result.node;
-    DataflowOutput n2_o1 = n2_result.outputs[0];
-
-    // Node with 1 input and 2 outputs
-    NodeAddedResult n3_result = g.add_node({n1_o2}, 1);
-    Node n3 = n3_result.node;
-    DataflowOutput n3_o1 = n3_result.outputs[0];
-    DataflowOutput n3_o2 = n3_result.outputs[1];
-
-    // Node with 2 inputs and 1 output
-    NodeAddedResult n4_result = g.add_node({n2_o1, n3_o1}, 1);
-    Node n4 = n4_result.node;
-    DataflowOutput n4_o1 = n4_result.outputs[0];
-```
-
-which generates the following graph
-
-```mermaid
-flowchart TD
-    subgraph Node1[ ]
-        direction TB
-        N1Process[n1]
-        n1_o1((n1_o1))
-        n1_o2((n1_o2))
-        N1Process --> n1_o1
-        N1Process --> n1_o2
-    end
-
-    subgraph Node2[ ]
-        direction TB
-        n2_i1((n2_i1))
-        n2_i2((n2_i2))
-        N2Process[n2]
-        n2_o1((o1))
-        n2_i1 --> N2Process
-        n2_i2 --> N2Process
-        N2Process --> n2_o1
-    end
-
-    subgraph Node3[ ]
-        direction TB
-        n3_i1((n3_i1))
-        N3Process[n3]
-        n3_o1((n3_o1))
-        n3_o2((n3_o2))
-        n3_i1 --> N3Process
-        N3Process --> n3_o1
-        N3Process --> n3_o2
-    end
-
-    subgraph Node4[ ]
-        direction TB
-        n4_i1((n4_i1))
-        n4_i2((n4_i2))
-        N4Process[n4]
-        n4_o1((n4_o1))
-        n4_i1 --> N4Process
-        n4_i2 --> N4Process
-        N4Process --> n4_o1
-    end
-
-    n1_o1 --> n2_i1
-    n1_o2 --> n2_i2
-    n1_o2 --> n3_i1
-    n2_o1 --> n4_i1
-    n3_o1 --> n4_i2
-```
-
-
-### Open Dataflow Variant
-
-`Open` should be interpreted in the topological sense: that is, a graph that contains some edges where one of the edge's 2 nodes is not present in the graph itself.
-This graph class is particularly useful for processing a sub-graph of a given graph while still maintaining information regarding the edges that cross the cut.
-`DataflowGraphInput` is used to represent the open (incoming) inputs to the graph. Note that, unlike `DataFlowInput`, `DataflowGraphInput`s are unordered (given that they are inputs to possibly several different nodes within the graph).
-
-### Labelled Dataflow Variant
-
-As nice as all of the above is, graphs without labels are mostly useless--in practice, nodes and edges represent some other system and the properties of that system (or at least a way to map the result of graph algorithms back to the underlying system) are necessary.
-Thus, FlexFlow's graph library provides the ability to add labels to `DataflowGraph`, through the `LabelleledDataflowGraph` and `OpenLabelleledDataflowGraph`, which allow users to label different components of the graph.
-- `LabelledDataflowGraph` allows for labelling of `Node`s and `DataflowOutput`s.
-- `OpenLabelledDataflowGraph` allows for labelling of `Node`s and `OpenDataflowValue`s, which is a variant describing both `DataflowOutput`s and `DataflowGraphInput`s.
-
-While the interfaces of these graphs differ slightly from the core graph variants, they still have the corresponding `add_node` methods, and `query_nodes`/`query_edges` methods. (Note that there is no `add_edge` method since, for `DataflowGraph`, edges are implicitly added when we add a node and specify its predecessors)
-Note that all of the labelled graph types require that each element of the labelled types have a label, which is enforced via the interfaces they provide.
-Partial labelling can be implement via wrapping the label type in `optional`.
-Interacting with `Node` and `Edge` objects is still necessary to use the labelled graph types: intuitively the labelled graph types can be thought of as a pair of a core graph variant and a hash map the maps nodes/edges to labels.
-As such, the labelled graph types provide the typical `at` method (as on `std::unordered_map`[^3]) and can be coerced to their underlying core graph variants.
-
-[^3]: `operator[]` currently is not present because all nodes must have labels and we don't require label types to be default constructible, though some simple template programming could probably add `operator[]` support in the cases where the label types _are_ default constructible.
-
-
-## Internals
-
-Most of the major graph classes in the library come in sets of 4. For a given class `GlassName` we have:
-1. `ClassName`
-2. `ClassNameView`
-3. `IClassName`
-4. `IClassNameView`
-
-General rules which apply to most classes:
-- `ClassName` (virtually) inherits from `ClassNameView`. Similarly, `IClassName` (virtually) inherits from `IClassNameView`.
-- `ClassName` has, as a member variable, a `cow_ptr` of type `IClassName`. Same holds for `ClassNameView`.
-Thus, the bulk of the inheritance that actually extends functionality is present among `IClassNameView` classes.
-
-
-### cow_ptr and Interfaces
-
-The reason for the existence of the `View` variants has been explained in previous sections.
-The existence of the `I(nterface)` variants stems from C++'s approach to modeling polymorphism.
-
-C++ polymorphism is achieved at runtime through the use of [virtual functions](https://www.learncpp.com/cpp-tutorial/virtual-functions/), which allow for a single function defined on some superclass to also work correctly on its subclasses.
-
-To create objects with polymorphic behaviour, we use the following syntax:
-`BaseClass* obj = new DerivedClass(); //or alternatives such as std::shared_ptr<BaseClass> obj = std::make_shared<DerivedClass>();`
-Any call to `obj`'s member functions are resolved at runtime (dynamic binding), with C++ calling the most derived implementation of the function.
-
-While this pattern works nicely, the way instantiation is done leaves the burden of memory management on the user.
-To address this, graph classes store a `cow_ptr` as a member variable, which point to instances of type equal to their corresponding interface class.
-
-All member functions present in `ClassName` and `ClassNameView` delegate their calls to their corresponding interface classes (which implement the actual logic), meaning that these classes essentially act as wrappers to their interface counterparts.
-
-### Virtual Inheritance
-Due to the complexity of the graph library, diamond-style inheritance patterns emerge.
-In the case of a diamond inheritance pattern, C++ will instantiate multiple copies of the base class whenever we instantiate a derived class.
-To address this issue, we employ <a href="https://en.wikipedia.org/wiki/Virtual_inheritance">virtual inheritance</a>, which removes the ambiguity associated with the multiple copies.
diff --git a/lib/utils/include/utils/graph/index.dox b/lib/utils/include/utils/graph/index.dox
index 68a6d05fd1..dcd1004761 100644
--- a/lib/utils/include/utils/graph/index.dox
+++ b/lib/utils/include/utils/graph/index.dox
@@ -1,3 +1,4 @@
+namespace FlexFlow {
 /**
 
 \page utils-graph utils/graph
@@ -15,15 +16,202 @@ Of course, there are also some optimizations that simply have not been done due
 
 \subsection core-graph-variants Core Graph Variants
 
+There is no single type of graph. Should it be directed? Allow multiple edges between nodes? Should nodes and/or edges have information attached?
+Because there is no single answer to this question, similar to <a href="https://networkx.org/">networkx</a> we provide a number of different graph variants.
+At their core, they are as follows:
+
+- \ref UndirectedGraph "": at most one edge allowed between every pair of nodes, edges are undirected.
+- \ref DiGraph "": at most one edge allowed between every ordered pair of nodes, edges are directed (i.e., have a source node and a destination node)
+- \ref MultiDiGraph "": arbitrary numbers of directed edges allowed between every pair of nodes.
+- \ref DataflowGraph "": used to model computation graphs. See the @ref dataflow-graph section for a detailed explanation.
+
+Examples of the different graph variants are shown below.
+
+Example of \ref UndirectedGraph "":
+\dot
+graph {
+    A [label=""];
+    B [label=""];
+    C [label=""];
+    D [label=""];
+    E [label=""];
+
+    A -- B
+    A -- C
+    B -- C
+    B -- B
+    D -- B
+}
+\enddot
+
+Example of \ref DiGraph "":
+\dot
+digraph {
+    A [label=""];
+    B [label=""];
+    C [label=""];
+    D [label=""];
+    E [label=""];
+    F [label=""];
+
+    A -> F
+    B -> E
+    B -> C
+    B -> B
+    D -> B
+    C -> D
+}
+\enddot
+
+Example of \ref MultiDiGraph "":
+\dot
+digraph {
+    A [label=""];
+    B [label=""];
+    C [label=""];
+    D [label=""];
+    E [label=""];
+    F [label=""];
+
+    A -> B
+    B -> C
+    C -> D
+    D -> A
+    B -> E
+    E -> B
+    D -> A
+    A -> E
+    D -> D
+    E -> E
+}
+\enddot
+
+Note that the node names are completely arbitrary: they have no apparent ordering or other meaning besides representing the topology of the graph.
+This is the case with all of the 4 core graph classes.
+Nodes are of type \ref Node, and from a user perspective are simply opaque handles, and source and destination indices should similarly be considered opaque from a user point of view.
+In addition, nodes should only be used in the context of their graph, so comparing or checking equality of nodes between different graphs (even of the same type) is undefined behavior \ref graph-footnote-1 "[1]".
+
+All three core graph variants allow insertion and deletion of both edges and nodes.
+To add a node to an \ref UndirectedGraph \c g, simply call <tt>g.add_node()</tt>, which will return a \ref Node object.
+For semantics closer to <tt>networkx</tt>'s method of adding nodes, <tt>g.add_node_unsafe(my_node)</tt> can be used. This is useful when constructing a modified copy of an existing graph (given that it maintains node bijection), though it is not generally recommended.
+The interface for node addition is identical for \ref DiGraph and \ref MultiDiGraph.
+To add an edge between two nodes \c n1 and \c n2 to an \ref UndirectedGraph \c g, call <tt>g.add_edge({n1, n2})</tt>.
+In \ref UndirectedGraph the order of the arguments of \ref UndirectedGraph::add_edge "add_edge" doesn't matter as edges are undirected, but the order does matter for \ref DiGraph, \ref MultiDiGraph and \ref DataflowGraph.
+
+The last paragraph covered the base API used to write to graphs, but we also want to be able to read from graphs.
+Reading from graphs is implemented with the \c query_nodes and \c query_edges methods, which can be thought of as executing a database query over the nodes and edges of the target graph, respectively (where queries are restricted to an incredibly simple set of operations).
+The argument to \c query_nodes is a \ref NodeQuery (which is simply a set of \ref Node ""s).
+\c query_nodes then returns the intersection of the nodes in the graph and the nodes in the query.
+The set of nodes in the query is actually a \ref std::optional, so \ref std::nullopt could also be passed, which would simply retrieve all nodes from the target graph (essentially \ref std::nullopt acts as the set of all nodes that could ever exist).
+\c query_edges functions similarly, but as with \c add_edge its behavior differs slightly between the three graph variants.
+\ref UndirectedGraph::query_edges simply takes an optional set of nodes and returns all edges that touch any of those nodes.
+\ref DiGraph::query_edges allows separate sets for source and destination nodes, and \ref MultiDiGraph::query_edges adds the ability to filter by source and destination indices as well.
+
+In practice you will rarely ever use \c query_nodes and \c query_edges as the graph library provides a large number of algorithms that do that work for you, but it can be helpful to understand this base layer if you ever need to implement your own algorithms.
+The layer users will most commonly interact with is the interface provided within either the \c algorithms.h header files or the \c algorithms folders, present in their respective graph class folders.
+They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as \ref get_nodes to as complex as \ref get_transitive_reduction and \ref get_dominators.
+Note that, due to the internal virtual inheritance structure, some functions for more primitive classes can be employed by the derived classes. (For example, `get_nodes` present in `node/algorithms.h` can be used by \ref DiGraph).
+You may notice that most of the algorithms present take as arguments not \ref UndirectedGraph, \ref DiGraph, and \ref MultiDiGraph, but rather \ref UndirectedGraphView, \ref DiGraphView, and \ref MultiDiGraphView.
+These \ref GraphView objects represent read-only (i.e., immutable) graphs.
+Similar to C++'s \c const semantics, \ref Graph ""s can be coerced \ref graph-footnote-2 "[2]" to \ref GraphView ""s but not the other way around.
+To transform a \ref GraphView to a \ref Graph, we can perform an explicit copy with \ref materialize_view.
+Both \ref Graph and \ref GraphView types follow normal value semantics.
+This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the \ref Graph and \ref GraphView types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a \ref DiGraph and use, for example, \ref "get_subgraph(DiGraphView const &, std::unordered_set<Node> const *)" "get_subgraph" to get a \ref DiGraphView representing a part of this graph, modifications to the underlying \ref DiGraph will not be mirrored in the \ref DiGraphView and the \ref DiGraphView will remain valid even after the base \ref DiGraph leaves scope).
+
+At this point, however, we still have not discussed how to create a graph.
+The user-facing graph interface is intentionally separated from the underlying graph representations, so representations can be changed without requiring any user-side code modifications besides the choice of which implementation to use.
+For example, to construct a \ref DiGraph which internally uses a representation such as \ref AdjacencyDiGraph we do the following:
+
+\code
+DiGraph g = DiGraph::create<AdjacencyDiGraph>();
+\endcode
+
+Generally users will use underlying representations provided by the graph library, but advanced users can create their own implementations (see the \ref graph-internals section).
+
 \subsection dataflow-graph DataflowGraph
 
+The primary abstraction for representing computation graphs / task graphs is the \ref DataflowGraph interface (along with its variants, \ref OpenDataflowGraph, \ref LabelledDataflowGraph and \ref OpenLabelledDataflowGraph).
+At a high level, nodes represent multivariate functions (from tuples of inputs to tuple of outputs), while edges represent value uses of such functions.
+
+\ref DataflowGraph is similar to \ref MultiDiGraph, but with the following important differences:
+  - The edges entering and exiting a given node have a well-defined order.
+  - The outputs of a given node also have a well-defined order.
+  - \ref DataflowGraph ""s are directed acyclic graphs. This is enforced by the interface used to construct them, since a node can only be added to the graph after all of its predecessor nodes have already been added.
+
+The main components of \ref DataflowGraph are as follows:
+- \ref DataflowInput: used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator).
+- \ref DataflowOutput: used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator).
+- \ref DataflowEdge: wrapper around a \ref DataflowInput, \ref DataflowOutput pair between 2 nodes.
+- \ref NodeAddedResult "": returned upon adding a new node. Contains the newly generated \ref Node and the \ref std::vector of \ref DataflowOutput ""s for the given node.
+
+\ref DataflowGraph ""s are constructed as follows:
+
+\code
+    auto g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    // Node with no inputs and 2 outputs
+    NodeAddedResult n1_result = g.add_node({}, 2);
+    Node n1 = n1_result.node;
+    DataflowOutput n1_o1 = n1_result.outputs[0];
+    DataflowOutput n1_o2 = n1_result.outputs[1];
+
+    // Node with 2 inputs and 1 output
+    NodeAddedResult n2_result = g.add_node({n1_o1, n1_o2}, 1);
+    Node n2 = n2_result.node;
+    DataflowOutput n2_o1 = n2_result.outputs[0];
+
+    // Node with 1 input and 2 outputs
+    NodeAddedResult n3_result = g.add_node({n1_o2}, 2);
+    Node n3 = n3_result.node;
+    DataflowOutput n3_o1 = n3_result.outputs[0];
+    DataflowOutput n3_o2 = n3_result.outputs[1];
+
+    // Node with 2 inputs and 1 output
+    NodeAddedResult n4_result = g.add_node({n2_o1, n3_o1}, 1);
+    Node n4 = n4_result.node;
+    DataflowOutput n4_o1 = n4_result.outputs[0];
+\endcode
+
+which generates the following graph
+
+\dot
+digraph {
+    node [shape=record];
+    n1 [label="{|{<o1>|<o2>}}"];
+    n2 [label="{{<i1>|<i2>}||{<o1>}}"];
+    n3 [label="{{<i1>}||{<o1>|<o2>}}"];
+    n4 [label="{{<i1>|<i2>}||{<o1>}}"];
+
+    n1:o1 -> n2:i1
+    n1:o2 -> n2:i2
+    n1:o2 -> n3:i1:n;
+    n2:o1 -> n4:i1
+    n3:o1 -> n4:i2
+}
+\enddot
+
 \subsection open-dataflow-variant Open Dataflow Variant
 
+"Open" should be interpreted in the topological sense: that is, a graph that contains some edges where one of the edge's 2 nodes is not present in the graph itself.
+This graph class is particularly useful for processing a sub-graph of a given graph while still maintaining information regarding the edges that cross the cut.
+\ref DataflowGraphInput is used to represent the open (incoming) inputs to the graph. Note that, unlike \ref DataflowInput, \ref DataflowGraphInput ""s are unordered (given that they are inputs to possibly several different nodes within the graph).
+
 \subsection labelled-dataflow-variant Labelled Dataflow Variant
 
+As nice as all of the above is, graphs without labels are mostly useless--in practice, nodes and edges represent some other system and the properties of that system (or at least a way to map the result of graph algorithms back to the underlying system) are necessary.
+Thus, FlexFlow's graph library provides the ability to add labels to \ref DataflowGraph, through the \ref LabelleledDataflowGraph and \ref OpenLabelleledDataflowGraph, which allow users to label different components of the graph.
+- \ref LabelledDataflowGraph allows for labelling of \ref Node ""s and \ref DataflowOutput ""s.
+- \ref OpenLabelledDataflowGraph allows for labelling of \ref Node ""s and \ref OpenDataflowValue ""s, which is a variant describing both \ref DataflowOutput ""s and \ref DataflowGraphInput ""s.
+
+While the interfaces of these graphs differ slightly from the core graph variants, they still have the corresponding \ref LabelledDataflowGraph::add_node methods, and \ref LabelledDataflowGraph::query_nodes / \ref LabelledDataflowGraph::query_edges methods. (Note that there is no \c add_edge method since, for \ref DataflowGraph, edges are implicitly added when we add a node and specify its predecessors)
+Note that all of the labelled graph types require that each element of the labelled types have a label, which is enforced via the interfaces they provide.
+Partial labelling can be implemented by wrapping the label type in \ref std::optional.
+Interacting with \c Node and \c Edge objects is still necessary to use the labelled graph types: intuitively the labelled graph types can be thought of as a pair of a core graph variant and a hash map that maps nodes/edges to labels.
+As such, the labelled graph types provide the typical \ref LabelledDataflowGraph::at method (as on \ref std::unordered_map \ref graph-footnote-3 "[3]") and can be coerced to their underlying core graph variants.
+
 \section graph-internals Internals
 
-\subsection cow-ptr-and-interfaces cow_ptr and Interfaces
+\subsection cow-ptr-and-interfaces cow_ptr_t and Interfaces
 
 The reason for the existence of the \c View variants has been explained in previous sections.
 The existence of the \c "I(nterface)" variants stems from C++'s approach to modeling polymorphism.
@@ -31,13 +219,15 @@ The existence of the \c "I(nterface)" variants stems from C++'s approach to mode
 C++ polymorphism is achieved at runtime through the use of <a href="https://www.learncpp.com/cpp-tutorial/virtual-functions/">virtual functions</a>, which allow for a single function defined on some superclass to also work correctly on its subclasses.
 
 To create objects with polymorphic behaviour, we use the following syntax:
+
 \code
 BaseClass* obj = new DerivedClass(); //or alternatives such as std::shared_ptr<BaseClass> obj = std::make_shared<DerivedClass>();
 \endcode
+
 Any call to \c obj 's member functions are resolved at runtime (dynamic binding), with C++ calling the most derived implementation of the function.
 
 While this pattern works nicely, the way instantiation is done leaves the burden of memory management on the user.
-To address this, graph classes store a \ref cow_ptr as a member variable, which point to instances of type equal to their corresponding interface class.
+To address this, graph classes store a \ref cow_ptr_t as a member variable, which points to an instance of their corresponding interface class.
 
 All member functions present in \c ClassName and \c ClassNameView delegate their calls to their corresponding interface classes (which implement the actual logic), meaning that these classes essentially act as wrappers to their interface counterparts.
 
@@ -47,4 +237,11 @@ Due to the complexity of the graph library, diamond-style inheritance patterns e
 In the case of a diamond inheritance pattern, C++ will instantiate multiple copies of the base class whenever we instantiate a derived class.
 To address this issue, we employ <a href="https://en.wikipedia.org/wiki/Virtual_inheritance">virtual inheritance</a>, which removes the ambiguity associated with the multiple copies.
 
+<hr/>
+
+1. \anchor graph-footnote-1 At some point we will likely add actual runtime checks on this, but for now we rely on the user not to mess up. Currently the implementation will keep going silently until the incorrectness grows so large that something breaks/crashes.
+2. \anchor graph-footnote-2 See <a href="https://en.wikipedia.org/wiki/Type_conversion">here</a> if you're not familiar with the term <em>type coercion</em>.
+3. \anchor graph-footnote-3 <tt>operator[]</tt> currently is not present because all nodes must have labels and we don't require label types to be default constructible, though some simple template programming could probably add <tt>operator[]</tt> support in the cases where the label types <em>are</em> default constructible.
+
 */
+}

From 56a51babcad2183f2e3b79532f6ce22bfc135eef Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Mon, 9 Mar 2026 20:57:04 -0700
Subject: [PATCH 093/113] Add docstrings for DistributedDeviceHandle et al

---
 .../realm-execution/distributed_device_handle.h       | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
index 1173d75b27..774d671ea3 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -8,6 +8,10 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each GPU, both local
+ * and remote. GPUs here are represented by \ref Realm::Processor ""s.
+ */
 struct DistributedDeviceHandle {
 public:
   DistributedDeviceHandle() = delete;
@@ -24,6 +28,13 @@ struct DistributedDeviceHandle {
       handles;
 };
 
+/**
+ * \brief Launch tasks (using \ref spawn_device_handle_init_task) to create the
+ * \ref device_handle_t ""s for each GPU and package the results into a
+ * DistributedDeviceHandle.
+ *
+ * \relates DistributedDeviceHandle
+ */
 DistributedDeviceHandle create_distributed_device_handle(
     RealmContext &ctx,
     size_t workSpaceSize,

From 1879cc65ad7888a02d46e09fd081e1dea632eac7 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Mon, 9 Mar 2026 21:09:52 -0700
Subject: [PATCH 094/113] Add docstring for PerDeviceOpStateBacking et al

---
 .../realm-execution/distributed_device_handle.h       |  8 ++++----
 ... distributed_per_device_op_state_initialization.h} | 11 +++++++++--
 .../per_device_op_state_backing.dtg.toml              |  8 ++++++++
 ... distributed_per_device_op_state_initalization.cc} |  2 +-
 4 files changed, 22 insertions(+), 7 deletions(-)
 rename lib/realm-execution/include/realm-execution/{distributed_device_state_initialization.h => distributed_per_device_op_state_initialization.h} (75%)
 rename lib/realm-execution/src/realm-execution/{distributed_device_state_initialization.cc => distributed_per_device_op_state_initalization.cc} (97%)

diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
index 774d671ea3..20f170d42a 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -9,8 +9,8 @@
 namespace FlexFlow {
 
 /**
- * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each GPU, both local
- * and remote. GPUs here are represented by \ref Realm::Processor ""s.
+ * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each %GPU, both local
+ * and remote. %GPUs here are represented by \ref Realm::Processor ""s.
  */
 struct DistributedDeviceHandle {
 public:
@@ -29,8 +29,8 @@ struct DistributedDeviceHandle {
 };
 
 /**
- * \brief Launch tasks (using \ref spawn_device_handle_init_task) to create the
- * \ref device_handle_t ""s for each GPU and package the results into a
+ * \brief Launches tasks (using \ref spawn_ff_handle_init_task) to create
+ * the \ref device_handle_t ""s for each %GPU and packages the results into a
  * DistributedDeviceHandle.
  *
  * \relates DistributedDeviceHandle
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h
similarity index 75%
rename from lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
rename to lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h
index b26a69078e..1518a9d04b 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_BACKING_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_BACKING_H
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_INITIALIZATION_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_INITIALIZATION_H
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
@@ -12,6 +12,13 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief Launches tasks (using \ref spawn_per_device_op_state_init_task) to
+ * create the \ref PerDeviceOpState ""s for each %GPU and packages the results
+ * into a PerDeviceOpStateBacking.
+ *
+ * \relates PerDeviceOpStateBacking
+ */
 PerDeviceOpStateBacking perform_distributed_device_state_initialization(
     RealmContext &ctx,
     DynamicOpenDataflowGraph const &dg,
diff --git a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
index 90a9d01e69..b0ba11f5b4 100644
--- a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
@@ -2,6 +2,14 @@ namespace = "FlexFlow"
 name = "PerDeviceOpStateBacking"
 type = "struct"
 features = []
+docstring = '''
+/**
+ * \brief Maps each shard-expanded DynamicNodeInvocation to its corresponding PerDeviceOpState.
+ *
+ * PerDeviceOpStateBacking is to PerDeviceOpState as DistributedDeviceHandle is to \ref device_handle_t (i.e., FFHandle).
+ */
+'''
+
 
 includes = [
   "<unordered_map>",
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initalization.cc
similarity index 97%
rename from lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
rename to lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initalization.cc
index 5c0aff00c2..8fdc9a9784 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initalization.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/distributed_device_state_initialization.h"
+#include "realm-execution/distributed_per_device_op_state_initialization.h"
 #include "local-execution/device_state_initialization.h"
 #include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"

From 77bf24baa0b1a5ee63390978dc08f34bf9e516b3 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Mon, 9 Mar 2026 23:41:11 -0700
Subject: [PATCH 095/113] More docs from reviewing 1629

---
 bin/CMakeLists.txt                            |  6 +--
 bin/export-model-arch/index.dox               |  2 +
 bin/index.dox                                 |  7 ++--
 bin/protobuf-to-json/index.dox                |  5 +++
 .../CMakeLists.txt                            |  0
 .../sp-ization-benchmarking}/distributions.h  |  0
 .../nasnet_bench_graph_generator.h            |  0
 .../sp-ization-benchmarking}/sample_graphs.h  |  0
 bin/sp-ization-benchmarking/index.dox         | 12 ++++++
 .../sp-ization-benchmarking}/distributions.cc |  0
 .../src/sp-ization-benchmarking/main.cc}      |  8 ++--
 bin/substitution-to-dot/index.dox             |  5 +++
 contributing.dox                              |  2 +-
 docs/doxygen/Doxyfile                         |  3 +-
 docs/realm-api.dox                            |  7 ++++
 index.dox                                     |  4 +-
 .../dynamic_tensor_accessor_from_instance.h   |  3 +-
 .../realm-execution/instance_allocation.h     | 16 +++++++-
 .../include/realm-execution/pcg_instance.h    | 11 ++++-
 .../per_device_op_state_backing.dtg.toml      |  8 ++--
 .../include/realm-execution/realm_allocator.h | 15 ++++++-
 .../include/realm-execution/realm_context.h   | 27 ++++++++----
 .../tasks/impl/ff_handle_init_return_task.h   | 15 +++++++
 .../tasks/impl/ff_handle_init_task.h          | 15 +++++++
 .../realm-execution/tasks/impl/index.dox      | 36 ++++++++++++----
 .../realm-execution/tasks/impl/op_task.h      |  6 ++-
 .../per_device_op_state_init_return_task.h    | 15 +++++++
 .../impl/per_device_op_state_init_task.h      | 15 +++++++
 .../realm-execution/tasks/realm_task_id_t.h   | 13 ------
 .../tasks/realm_task_registry.h               |  6 +++
 .../realm-execution/tasks/task_id_t.dtg.toml  | 14 +++----
 .../include/realm-execution/tasks/task_id_t.h | 13 ++++++
 .../tensor_instance_backing.dtg.toml          |  9 ++--
 .../realm-execution/tensor_instance_backing.h |  9 ++++
 lib/realm-execution/index.dox                 |  7 ++--
 .../realm-execution/instance_allocation.cc    |  6 +--
 .../realm-execution/tasks/realm_task_id_t.cc  | 10 -----
 .../tasks/realm_task_registry.cc              |  2 +-
 .../src/realm-execution/tasks/task_id_t.cc    |  5 +++
 lib/utils/include/utils/graph/index.dox       |  2 +
 .../sp_ization/escribano_algo.h               |  4 +-
 .../sp_ization/flexible_algo.h                |  4 +-
 .../sp_ization/{README.md => index.dox}       | 41 +++++++++++++------
 .../sp_ization/naive_stratum_sync.h           |  4 +-
 .../sp_ization/work_duplicating_sp_ization.h  |  8 +---
 45 files changed, 290 insertions(+), 110 deletions(-)
 rename bin/{sp_ization_benchmarking => sp-ization-benchmarking}/CMakeLists.txt (100%)
 rename bin/{sp_ization_benchmarking => sp-ization-benchmarking/include/sp-ization-benchmarking}/distributions.h (100%)
 rename bin/{sp_ization_benchmarking => sp-ization-benchmarking/include/sp-ization-benchmarking}/nasnet_bench_graph_generator.h (100%)
 rename bin/{sp_ization_benchmarking => sp-ization-benchmarking/include/sp-ization-benchmarking}/sample_graphs.h (100%)
 create mode 100644 bin/sp-ization-benchmarking/index.dox
 rename bin/{sp_ization_benchmarking => sp-ization-benchmarking/src/sp-ization-benchmarking}/distributions.cc (100%)
 rename bin/{sp_ization_benchmarking/sp_ization_benchmarking.cc => sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc} (99%)
 create mode 100644 docs/realm-api.dox
 delete mode 100644 lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
 delete mode 100644 lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
 rename lib/utils/include/utils/graph/series_parallel/sp_ization/{README.md => index.dox} (80%)

diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index ac19f9011e..6855537460 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -7,11 +7,7 @@ if(FF_BUILD_VISUALIZATION_TOOL)
 endif()
 
 if(FF_BUILD_SP_IZATION_BENCHMARKING)
-  add_subdirectory(sp_ization_benchmarking)
-endif()
-
-if(FF_BUILD_ARG_PARSER)
-  add_subdirectory(arg_parser)
+  add_subdirectory(sp-ization-benchmarking)
 endif()
 
 if(FF_BUILD_BIN_EXPORT_MODEL_ARCH)
diff --git a/bin/export-model-arch/index.dox b/bin/export-model-arch/index.dox
index c969e6481f..349ed2209c 100644
--- a/bin/export-model-arch/index.dox
+++ b/bin/export-model-arch/index.dox
@@ -3,6 +3,8 @@ namespace FlexFlow {
 
 @page export-model-arch export-model-arch
 
+\brief Exports the model computation graphs defined in the @ref models library as either JSON (for use outside of FlexFlow) or as DOT (for visualization). Can also optionally export the SP decompositions of the computation graphs.
+
 A tool for exporting (for details of the file format, see \ref file-format) and visualizing the model ComputationGraphs defined in @ref models.
 To build and run \c export-model-arch, run the following commands from the root of the %FlexFlow %Train repository:
 
diff --git a/bin/index.dox b/bin/index.dox
index 5e688a54d6..4944e50067 100644
--- a/bin/index.dox
+++ b/bin/index.dox
@@ -4,8 +4,9 @@
 
 This directory contains command-line interfaces for %FlexFlow %Train and associated tools (all in C++).
 
-- \subpage export-model-arch "": Exports the model computation graphs defined in the @ref models library as either JSON (for use outside of FlexFlow) or as DOT (for visualization). Can also optionally export the SP decompositions of the computation graphs.
-- \subpage substitution-to-dot "": Converts TASO-generated substitutions from the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)) into DOT for visualization.
-- \subpage protobuf-to-json "": Converts TASO-generated substitutions from the legacy protobuf format ([example](../substitutions/graph_subst_3_v2.pb)) to the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)). Will be removed in the future once the substitution generator is integrated natively into FlexFlow Train (tracked in [#351](https://github.com/flexflow/flexflow-train/issues/351)).
+- \subpage export-model-arch "": \copybrief export-model-arch
+- \subpage protobuf-to-json "": \copybrief protobuf-to-json
+- \subpage sp-ization-benchmarking "": \copybrief sp-ization-benchmarking
+- \subpage substitution-to-dot "": \copybrief substitution-to-dot
 
 */
diff --git a/bin/protobuf-to-json/index.dox b/bin/protobuf-to-json/index.dox
index 6c95370947..a49b0fbbd3 100644
--- a/bin/protobuf-to-json/index.dox
+++ b/bin/protobuf-to-json/index.dox
@@ -3,5 +3,10 @@ namespace FlexFlow {
 
 \page protobuf-to-json
 
+\brief Converts TASO-generated substitutions from the legacy protobuf format ([example](../substitutions/graph_subst_3_v2.pb)) to the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)). Will be removed in the future once the substitution generator is integrated natively into FlexFlow Train (tracked in [#351](https://github.com/flexflow/flexflow-train/issues/351)).
+
+\todo
+  \@lockshaw Add docs and example(s) for protobuf-to-json. See \ref export-model-arch for an example.
+
 */
 }
diff --git a/bin/sp_ization_benchmarking/CMakeLists.txt b/bin/sp-ization-benchmarking/CMakeLists.txt
similarity index 100%
rename from bin/sp_ization_benchmarking/CMakeLists.txt
rename to bin/sp-ization-benchmarking/CMakeLists.txt
diff --git a/bin/sp_ization_benchmarking/distributions.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/distributions.h
similarity index 100%
rename from bin/sp_ization_benchmarking/distributions.h
rename to bin/sp-ization-benchmarking/include/sp-ization-benchmarking/distributions.h
diff --git a/bin/sp_ization_benchmarking/nasnet_bench_graph_generator.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h
similarity index 100%
rename from bin/sp_ization_benchmarking/nasnet_bench_graph_generator.h
rename to bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h
diff --git a/bin/sp_ization_benchmarking/sample_graphs.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/sample_graphs.h
similarity index 100%
rename from bin/sp_ization_benchmarking/sample_graphs.h
rename to bin/sp-ization-benchmarking/include/sp-ization-benchmarking/sample_graphs.h
diff --git a/bin/sp-ization-benchmarking/index.dox b/bin/sp-ization-benchmarking/index.dox
new file mode 100644
index 0000000000..9af57a7ff3
--- /dev/null
+++ b/bin/sp-ization-benchmarking/index.dox
@@ -0,0 +1,12 @@
+namespace FlexFlow {
+/**
+
+\page sp-ization-benchmarking
+
+\brief Executes evaluations for the various SP-ization algorithms in \ref spization.
+
+\todo
+  \@pietro Add usage docs and example(s) for sp-ization-benchmarking. See \ref export-model-arch for an example.
+
+*/
+}
diff --git a/bin/sp_ization_benchmarking/distributions.cc b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/distributions.cc
similarity index 100%
rename from bin/sp_ization_benchmarking/distributions.cc
rename to bin/sp-ization-benchmarking/src/sp-ization-benchmarking/distributions.cc
diff --git a/bin/sp_ization_benchmarking/sp_ization_benchmarking.cc b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc
similarity index 99%
rename from bin/sp_ization_benchmarking/sp_ization_benchmarking.cc
rename to bin/sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc
index bc98a3a606..933ae535db 100644
--- a/bin/sp_ization_benchmarking/sp_ization_benchmarking.cc
+++ b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc
@@ -1,5 +1,5 @@
 /**
- * @file sp_ization_benchmarking.cpp
+ * @file main.cc
  * @brief Benchmarking different SP-ization techniques on various graphs.
  *
  * @details
@@ -22,9 +22,9 @@
  * run make and then ./sp_ization_benchmarking
  */
 
-#include "distributions.h"
-#include "nasnet_bench_graph_generator.h"
-#include "sample_graphs.h"
+#include "sp-ization-benchmarking/distributions.h"
+#include "sp-ization-benchmarking/nasnet_bench_graph_generator.h"
+#include "sp-ization-benchmarking/sample_graphs.h"
 #include "utils/graph/digraph/algorithms/transitive_reduction.h"
 #include "utils/graph/digraph/digraph_view.h"
 #include "utils/graph/node/algorithms.h"
diff --git a/bin/substitution-to-dot/index.dox b/bin/substitution-to-dot/index.dox
index abbcbed5c6..15ee221f93 100644
--- a/bin/substitution-to-dot/index.dox
+++ b/bin/substitution-to-dot/index.dox
@@ -3,5 +3,10 @@ namespace FlexFlow {
 
 \page substitution-to-dot
 
+\brief Converts TASO-generated substitutions from the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)) into DOT for visualization.
+
+\todo
+  \@lockshaw Add usage docs and example(s) for substitution-to-dot. See \ref export-model-arch for an example.
+
 */
 }
diff --git a/contributing.dox b/contributing.dox
index ae643097bb..80373bf2ff 100644
--- a/contributing.dox
+++ b/contributing.dox
@@ -4,7 +4,7 @@
 
 \section contributing-setup Setup
 
-\note If you are developing on Stanford's sapling cluster, instead see the instructions \ref sapling-setup "here". If you don't know what this means, you're not using sapling so you should just continue reading.
+\note If you are developing on Stanford's sapling cluster, instead see the instructions \subpage sapling-setup "here". If you don't know what this means, you're not using sapling so you should just continue reading.
 
 1. %FlexFlow %Train uses <a href="https://nix.dev/manual/nix/2.24/">nix</a> to manage dependencies and the development environment.
    There exist a number of ways to install nix, but we recommend one of the following:
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index ef0b91d4d8..8fce95e656 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -947,7 +947,8 @@ INPUT                  = $(FF_HOME)/lib \
                          $(FF_HOME)/bin \
                          $(FF_HOME)/index.dox \
                          $(FF_HOME)/contributing.dox \
-                         $(FF_HOME)/docs/sapling.dox
+                         $(FF_HOME)/docs/sapling.dox \
+                         $(FF_HOME)/docs/realm-api.dox
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/docs/realm-api.dox b/docs/realm-api.dox
new file mode 100644
index 0000000000..8fe7aba443
--- /dev/null
+++ b/docs/realm-api.dox
@@ -0,0 +1,7 @@
+/**
+
+\page realm-api Realm API Reference
+
+- \anchor realm-instance <a href="https://legion.stanford.edu/realm/doc/main/classRealm_1_1RegionInstance.html">Realm::RegionInstance</a>
+
+*/
diff --git a/index.dox b/index.dox
index a97476dbf3..5417431a69 100644
--- a/index.dox
+++ b/index.dox
@@ -10,8 +10,8 @@ The bulk of the %FlexFlow source code is stored in the following folders:
 
 - \subpage lib "": The C++ code that makes up %FlexFlow's core, split up into a number of libraries. You can find a description of each library [here](./lib/README.md).
 - \subpage bin "": Command-line interfaces for %FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in \ref lib for the actual processing/logic. You can find a description of each binary \ref bin "here".
-- `bindings`: Python (or any additional languages added in the future) bindings for %FlexFlow %Train
-- `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files as either `.md` files or inline in the language's documentation syntax (i.e., <a href="https://www.doxygen.nl/manual/index.html">Doxygen</a> for C++ and <a href="https://www.sphinx-doc.org/en/master/">Sphinx</a> for Python).
+- `bindings`: Python (or any additional languages added in the future) bindings for %FlexFlow %Train. Still mostly unimplemented.
+- `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files in <a href="https://www.doxygen.nl/manual/index.html">Doxygen</a> syntax either in standalone `.dox` files or inline in header files.
 - `cmake`: CMake configuration for building %FlexFlow %Train. Note that unless you're modifying the build configuration (i.e., adding a library, additional dependencies, etc.), you generally should use \ref contributing-proj "proj" instead of interacting with CMake directly.
 
 \section root-contributing Contributing
diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
index 638e2f3b22..6891eca60d 100644
--- a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
+++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
@@ -9,7 +9,8 @@
 namespace FlexFlow {
 
 /**
- * @brief Turn a %Realm region instance into a GenericTensorAccessor.
+ * @brief Turn a %Realm region instance into a GenericTensorAccessor by
+ * re-wrapping the raw pointer.
  */
 DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
     Realm::RegionInstance inst,
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index 95530c0eee..a9dfb5d9c3 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -7,17 +7,31 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief Allocates a (potentially remote) %Realm instance for \p value
+ * on the device represented by \p device_coord.
+ */
 std::pair<Realm::RegionInstance, Realm::Event>
-    perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
+    perform_instance_allocation_for_value(MachineSpaceCoordinate const &device_coord,
                                           DynamicValueAttrs const &value,
                                           RealmContext &ctx);
 
+/**
+ * @brief Allocates the (potentially remote) %Realm instances for all of the
+ * values in \p g, excluding the preallocated values in \p preallocated,
+ * using \ref perform_instance_allocation_for_value.
+ *
+ * \relates TensorInstanceBacking
+ */
 TensorInstanceBacking perform_instance_allocation(
     DynamicOpenDataflowGraph const &g,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &preallocated,
     RealmContext &ctx);
 
+/**
+ * @brief Destroys all of the instances held in \p instances.
+ */
 void destroy_instances(TensorInstanceBacking const &instances,
                        Realm::Event precondition);
 
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h
index a966c9a01b..80a6b6e708 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance.h
@@ -54,13 +54,15 @@ struct PCGInstance {
 
   void update_optimizer_attrs_for_next_iter();
 
-  // getters
+  /** \name Getters **/
+  ///\{
   RealmContext &get_realm_context();
   std::vector<DynamicNodeInvocation> const &get_execution_order() const;
   TensorInstanceBacking const &get_tensor_instance_backing() const;
   PerDeviceOpStateBacking const &get_device_state_backing() const;
   OptimizerAttrs const &get_optimizer_attrs() const;
   std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
+  ///\}
 
 private:
   RealmContext &ctx;
@@ -71,6 +73,11 @@ struct PCGInstance {
   std::optional<Realm::RegionInstance> logit_grad_tensor;
 };
 
+/**
+ * \brief Creates a PCGInstance. Should generally be used instead of PCGInstance::PCGInstance.
+ *
+ * \relates PCGInstance
+ */
 PCGInstance create_pcg_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
@@ -92,6 +99,8 @@ PCGInstance create_pcg_instance(
  * - \ref perform_forward_pass_for_pcg_instance
  * - \ref perform_backward_pass_for_pcg_instance
  * - \ref perform_update_pass_for_pcg_instance
+ *
+ * \relates PCGInstance
  */
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_pcg_instance(
diff --git a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
index b0ba11f5b4..89feb11905 100644
--- a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
@@ -3,11 +3,9 @@ name = "PerDeviceOpStateBacking"
 type = "struct"
 features = []
 docstring = '''
-/**
- * \brief Maps each shard-expanded DynamicNodeInvocation to its corresponding PerDeviceOpState.
- *
- * PerDeviceOpStateBacking is to PerDeviceOpState as DistributedDeviceHandle is to \ref device_handle_t (i.e., FFHandle).
- */
+\brief Maps each shard-expanded DynamicNodeInvocation to its corresponding PerDeviceOpState.
+
+PerDeviceOpStateBacking is to PerDeviceOpState as DistributedDeviceHandle is to \ref device_handle_t (i.e., FFHandle).
 '''
 
 
diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
index d716016676..b3bc277c73 100644
--- a/lib/realm-execution/include/realm-execution/realm_allocator.h
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -6,8 +6,16 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief An IAllocator instance that performs/manages each allocation as a
+ * \ref realm-instance "Realm Instance".
+ *
+ * \note As with the other instances of IAllocator, you generally want to use
+ * \ref get_realm_allocator rather than explicitly calling the constructor of
+ * RealmAllocator.
+ */
 struct RealmAllocator : public IAllocator {
-  RealmAllocator(Realm::Processor processor, Realm::Memory memory);
+  explicit RealmAllocator(Realm::Processor processor, Realm::Memory memory);
 
   RealmAllocator() = delete;
   RealmAllocator(RealmAllocator const &) = delete;
@@ -26,6 +34,11 @@ struct RealmAllocator : public IAllocator {
 };
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmAllocator);
 
+/**
+ * \brief Creates a RealmAllocator instance as an Allocator.
+ *
+ * \relates RealmAllocator
+ */
 Allocator get_realm_allocator(Realm::Processor processor, Realm::Memory memory);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 2aba46e47a..a616851975 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -29,17 +29,22 @@ struct RealmContext {
   RealmContext(RealmContext const &) = delete;
   RealmContext(RealmContext &&) = delete;
 
-  // Device mapping
+  /** \name Device mapping */
+  ///\{
   Realm::Processor
       map_device_coord_to_processor(MachineSpaceCoordinate const &);
   static Realm::Memory get_nearest_memory(Realm::Processor);
+  ///\}
 
-  // Current device context
+  /** \name Current device context */
+  ///\{
   Realm::Processor get_current_processor() const;
   Allocator &get_current_device_allocator();
   device_id_t get_current_device_idx() const;
+  ///\}
 
-  // Task creation
+  /** \name Task creation */
+  ///\{
   Realm::Event spawn_task(Realm::Processor proc,
                           task_id_t task_id,
                           void const *args,
@@ -55,20 +60,28 @@ struct RealmContext {
                             size_t arglen,
                             Realm::Event wait_on = Realm::Event::NO_EVENT,
                             int priority = 0);
+  ///\}
 
-  // Instance management
+  /** \name Instance management */
+  ///\{
   std::pair<Realm::RegionInstance, Realm::Event>
       create_instance(Realm::Memory memory,
                       TensorShape const &shape,
                       Realm::ProfilingRequestSet const &prs,
                       Realm::Event wait_on = Realm::Event::NO_EVENT);
+  ///\}
 
-  // Get the current set of outstanding events
+  /**
+   * \brief Get the current set of outstanding events
+   */
   Realm::Event get_outstanding_events();
 
 protected:
-  // Compact AND CLEAR the outstanding event queue
-  // Important: USER MUST BLOCK on event or else use it, or it WILL BE LOST
+  /**
+   * \brief Compact **and clear** the outstanding event queue
+   *
+   * \warning **User must block** on event or else use it, or it **will be lost**.
+   */
   [[nodiscard]] Realm::Event merge_outstanding_events();
 
   void discover_machine_topology();
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index cf45cd8b67..f7de2b1293 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -7,9 +7,24 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief The function registered as a %Realm task for returning the
+ * asynchronously-initialized FFHandle. Dispatched by \ref
+ * spawn_ff_handle_init_return_task.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 void ff_handle_init_return_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
+/**
+ * \brief Launches the task (\ref ff_handle_init_return_task_body) for returning
+ * the asynchronously-initialized FFHandle.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 Realm::Event spawn_ff_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
index 89485100af..8588816576 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
@@ -7,9 +7,24 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief The function registered as a %Realm task for starting the asynchronous
+ * initialization of the FFHandle. Dispatched by \ref
+ * spawn_ff_handle_init_task.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 void ff_handle_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
+/**
+ * \brief Launches the task (\ref ff_handle_init_task_body) for starting
+ * the asynchronous initialization of the FFHandle.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 Realm::Event spawn_ff_handle_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
index 89e4e9642e..5c87839040 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
@@ -3,14 +3,34 @@ namespace {
 
 \page realm-execution-tasks tasks/
 
-\c realm-execution groups tasks into four kinds:
+\c realm-execution groups tasks into four kinds (\ref tasks-controller-tasks, \ref tasks-op-task, \ref tasks-ffhandle-init, and \ref tasks-op-state-init), each of which is implemented using one of two patterns (\ref tasks-one-part or \ref tasks-two-part).
 
-- \ref controller_task.h "Controller Tasks": At most one of these per machine. Runs the \ref term-controller.
-- \ref op_task.h "Operator Tasks":
-- \ref ff_handle_init_task.h "FF Handle Init Tasks":
-- \ref ff_handle_init_return_task.h "FF Handle Init Return Tasks":
-- \ref per_device_op_state_init_task.h "Per Device Op State Init Tasks":
-- \ref per_device_op_state_init_return_task.h "Per Device Op State Init Return Tasks":
+\section tasks-one-part Individual Tasks
+
+Individual tasks are just normal %Realm tasks, which are implemented in \ref realm-execution as a
+wrapper function for spawning a task (e.g., \ref collective_spawn_controller_task) and a task body which is the actual %Realm task implementation (e.g., \ref controller_task_body). Each also has an optional corresponding <em>TaskArgument</em> (e.g., OpTaskArgs) object to provide a structure to the arguments passed from the wrapper to the task body. In cases where the %TaskArgument object is not trivially JSON-serializable, a corresponding JSON-serializable task argument type is provided (e.g., SerializeableOpTaskArgs).
+
+\subsection tasks-controller-tasks Controller Tasks
+
+Runs the \ref term-controller. At most one of these per machine (i.e., this task is a singleton in the object-oriented sense of the word). Implemented in \ref controller_task.h.
+
+\subsection tasks-op-task Operator Tasks
+
+Implements all of the operator tasks, i.e., the tasks that are executed during training (i.e., forward, backward, update/optimizer, and loss tasks). Implemented in \ref op_task.h.
+
+\section tasks-two-part Paired Tasks
+
+The other two types of tasks are implemented as pairs of tasks: one to begin initializing a value (e.g., \ref spawn_ff_handle_init_task), and another to return the initialized value when it's ready (e.g., \ref spawn_ff_handle_init_return_task). As with \ref tasks-one-part, they have an optional corresponding task argument type and a potential serializable task argument type.
+
+\todo \@Elliott why is the paired tasks structure required? Is it a performance optimization, or simply necessary given the set of primitives %Realm provides?
+
+\subsection tasks-ffhandle-init FFHandle Initialization Tasks
+
+For initializing the FFHandle for each %GPU. Implemented in \ref ff_handle_init_task.h and \ref ff_handle_init_return_task.h.
+
+\subsection tasks-op-state-init PerDeviceOpState Initialization Tasks
+
+For initializing the PerDeviceOpState for each shard of an operator task. Implemented in \ref per_device_op_state_init_task.h and \ref per_device_op_state_init_return_task.h.
 
-}
 */
+}
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 7853444bfe..29c49aa5fa 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -23,7 +23,8 @@ namespace FlexFlow {
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task for a DynamicNodeInvocation using realm.
+ * \brief Launches the task (\ref op_task_body, for a DynamicNodeInvocation
+ * using %Realm.
  *
  * The task launch process functions a bit differently to that used in the
  * previous FlexFlow codebase. Rather than having a function registered with
@@ -47,6 +48,9 @@ void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
  * grab the corresponding pointer/GenericTensorAccessor, and then use
  * LocalTaskArgumentAccessor for the actual argument access as, by this point,
  * everything is local.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
  */
 Realm::Event spawn_op_task(
     RealmContext &ctx,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
index 7027ad7555..5d8b0d4beb 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
@@ -8,9 +8,24 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief The function registered as a %Realm task for returning the
+ * asynchronously-initialized PerDeviceOpState. Dispatched by \ref
+ * spawn_per_device_op_state_init_return_task.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 void per_device_op_state_init_return_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
+/**
+ * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for returning
+ * the asynchronously-initialized PerDeviceOpState.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 Realm::Event spawn_per_device_op_state_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
index 1c7db6e0d4..20cf1c3e5f 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
@@ -14,9 +14,24 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief The function registered as a %Realm task for starting the asynchronous
+ * initialization of the PerDeviceOpState. Dispatched by \ref
+ * spawn_per_device_op_state_init_task.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 void per_device_op_state_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
+/**
+ * \brief Launches the task (\ref ff_handle_init_return_task_body) for starting
+ * the asynchronous initialization of the PerDeviceOpState.
+ *
+ * To understand how this fits into the broader structure of \ref
+ * realm-execution, see \ref realm-execution-tasks.
+ */
 std::optional<Realm::Event> spawn_per_device_op_state_init_task(
     RealmContext &ctx,
     Realm::Processor target_proc,
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
deleted file mode 100644
index a3c6891fb0..0000000000
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_ID_T_H
-#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_ID_T_H
-
-#include "realm-execution/realm.h"
-#include "realm-execution/tasks/task_id_t.dtg.h"
-
-namespace FlexFlow {
-
-Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
index 8114f1a82c..293821e2e6 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -6,6 +6,9 @@
 
 namespace FlexFlow {
 
+/**
+ * \relates task_id_t
+ */
 [[nodiscard]] Realm::Event register_task(Realm::Processor::Kind target_kind,
                                          task_id_t func_id,
                                          void (*task_body)(void const *,
@@ -14,6 +17,9 @@ namespace FlexFlow {
                                                            size_t,
                                                            Realm::Processor));
 
+/**
+ * \brief Registers all known tasks (using \ref register_task).
+ */
 [[nodiscard]] Realm::Event register_all_tasks();
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
index d0abb95f5a..ea41f63d3f 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
@@ -7,19 +7,19 @@ features = [
   "rapidcheck",
   "json",
 ]
-docstring = """
-@brief An enum for identifying tasks for use in the realm runtime.
+docstring = '''
+\brief An enum for identifying tasks for use in the realm runtime.
 
-@note Many of these are pulled over from the old FlexFlow codebase and are no
+\note Many of these are pulled over from the old FlexFlow codebase and are no
 longer in use. Eventually these should be pruned down to the set of tasks we're
 actually using.
 
-@note @ref task_id_t is used by the realm runtime (i.e., `realm-execution`),
-but not by realm directly: realm-execution uses @ref
-get_realm_task_id_for_task_id to convert every @ref task_id_t into a
+\note @ref task_id_t is used by the realm runtime (i.e., \ref realm-execution),
+but not by realm directly: realm-execution uses \ref
+get_realm_task_id_for_task_id to convert every \ref task_id_t into a
 Realm::Processor::TaskFuncID, which is what is actually used for task launches,
 etc.
-"""
+'''
 
 [[values]]
 name = "CONTROLLER_TASK_ID"
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
index 53945d2e5b..4a14156c60 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -6,9 +6,14 @@
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
 #include <optional>
+#include "realm-execution/realm.h"
 
 namespace FlexFlow {
 
+/**
+ * \brief Retrieves the \ref task_id_t for a DynamicNodeAttrs, with
+ * a return value of \ref std::nullopt to be treated as a no-op task.
+ */
 std::optional<task_id_t>
     get_task_id_for_op(DynamicNodeAttrs const &,
                        std::optional<OptimizerAttrs> const &);
@@ -23,6 +28,14 @@ std::optional<task_id_t> get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &);
 std::optional<task_id_t>
     get_update_task_id_for_optimizer_attrs(OptimizerAttrs const &);
 
+/**
+ * \brief Convert a FlexFlow::task_id_t into a %Realm task id.
+ *
+ * \relates task_id_t
+ */
+Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t);
+
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index 929b8e5ce3..02a0e95ba1 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -6,11 +6,12 @@ features = [
   "fmt",
   #"hash",
 ]
-docstring = """
-@brief A simple container for mapping between DynamicValueAttrs and the corresponding realm instances/events.
+docstring = '''
+\brief A simple container for mapping between DynamicValueAttrs and the corresponding realm instances/events.
 
-@note The actual logic for doing instance allocation is in @ref perform_instance_allocation.
-"""
+\note The actual logic for doing instance allocation and destruction are in \ref perform_instance_allocation
+and \ref destroy_instances, respectively.
+'''
 
 includes = [
   "<unordered_map>",
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
index 72a8bf439a..93e525a349 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
@@ -6,8 +6,17 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief Make an empty TensorInstanceBacking.
+ *
+ * \relates TensorInstanceBacking
+ */
 TensorInstanceBacking make_empty_tensor_instance_backing();
 
+/**
+ * \brief Get the subset of the given TensorInstanceBacking necessary to execute
+ * the given DynamicNodeInvocation.
+ */
 TensorInstanceBacking subset_tensor_instance_backing_for_invocation(
     TensorInstanceBacking const &, DynamicNodeInvocation const &);
 
diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox
index e8f24507cc..d43418f03f 100644
--- a/lib/realm-execution/index.dox
+++ b/lib/realm-execution/index.dox
@@ -3,8 +3,7 @@ namespace FlexFlow {
 
 \page realm-execution realm-execution
 
-\brief Executes distributed \ref MappedParallelComputationGraph ""s using realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec
-       Used both for testing and (eventually by \ref realm-execution) for fusing operator task launches.
+\brief Executes distributed \ref MappedParallelComputationGraph ""s using realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec. Used both for testing and (eventually by \ref realm-execution) for fusing operator task launches.
 
 The %Realm backend for distributed execution.
 
@@ -19,7 +18,7 @@ This is a single-controller implementation. That means the controller (the task
 - \ref PCGInstance "": \copybrief PCGInstance
 - \ref RealmManager "": \copybrief RealmManager
 - \ref RealmContext "": \copybrief RealmContext
-- \ref "include/realm-execution/tasks": The Realm task implementations and their supporting infrastructure.
+- \subpage realm-execution-tasks "include/realm-execution/tasks": The Realm task implementations and their supporting infrastructure.
   - \ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
   - \ref "lib/realm-execution/include/realm-execution/tasks/serializer/" "serializer/": additional support for serializing %Realm data types.
   - \ref realm_task_registry.h: Manages the registration of %Realm tasks. All %Realm tasks go through this interface.
@@ -30,7 +29,7 @@ This is a single-controller implementation. That means the controller (the task
   - \ref "distributed_device_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting PerDeviceOpStateBacking.
   - \ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting TensorInstanceBacking.
 
-\section realm-execution-todo TODO
+\section realm-execution-todo Outstanding TODOs
 
 - external instances
 - copies
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index e003e5b71a..2801a70940 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -22,14 +22,13 @@
 namespace FlexFlow {
 
 std::pair<Realm::RegionInstance, Realm::Event>
-    perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
+    perform_instance_allocation_for_value(MachineSpaceCoordinate const &device_coord,
                                           DynamicValueAttrs const &value,
                                           RealmContext &ctx) {
   ASSERT(value.accessor == std::nullopt);
 
   TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value());
 
-  MachineSpaceCoordinate device_coord = assert_unwrap(node.device_coord);
   Realm::Processor proc = ctx.map_device_coord_to_processor(device_coord);
   Realm::Memory memory = ctx.get_nearest_memory(proc);
   return ctx.create_instance(memory, shape, Realm::ProfilingRequestSet());
@@ -53,8 +52,9 @@ TensorInstanceBacking perform_instance_allocation(
       NOT_IMPLEMENTED();
     } else {
       if (!contains_key(result.backing, v)) {
+        MachineSpaceCoordinate device_coord = assert_unwrap(n.device_coord);
         result.backing.insert(
-            std::pair{v, perform_instance_allocation_for_value(n, v, ctx)});
+            std::pair{v, perform_instance_allocation_for_value(device_coord, v, ctx)});
       }
       return result.backing.at(v);
     }
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
deleted file mode 100644
index ec1aa143a6..0000000000
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "realm-execution/tasks/realm_task_id_t.h"
-
-namespace FlexFlow {
-
-Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t task_id) {
-  return Realm::Processor::TASK_ID_FIRST_AVAILABLE +
-         static_cast<Realm::Processor::TaskFuncID>(task_id);
-}
-
-} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 6cc4ff4d02..0372dcdfeb 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -5,7 +5,7 @@
 #include "realm-execution/tasks/impl/device_state_init_return_task.h"
 #include "realm-execution/tasks/impl/device_state_init_task.h"
 #include "realm-execution/tasks/impl/op_task.h"
-#include "realm-execution/tasks/realm_task_id_t.h"
+#include "realm-execution/tasks/task_id_t.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
diff --git a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
index 94e1b887e7..dd4b0a66ca 100644
--- a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
@@ -190,4 +190,9 @@ std::optional<task_id_t> get_update_task_id_for_optimizer_attrs(
   });
 }
 
+Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t task_id) {
+  return Realm::Processor::TASK_ID_FIRST_AVAILABLE +
+         static_cast<Realm::Processor::TaskFuncID>(task_id);
+}
+
 } // namespace FlexFlow
diff --git a/lib/utils/include/utils/graph/index.dox b/lib/utils/include/utils/graph/index.dox
index dcd1004761..355117497f 100644
--- a/lib/utils/include/utils/graph/index.dox
+++ b/lib/utils/include/utils/graph/index.dox
@@ -3,6 +3,8 @@ namespace FlexFlow {
 
 \page utils-graph utils/graph
 
+- \subpage spization
+
 \note This documentation is somewhat out of date and, more importantly, \c utils/graph is in rather dire need of a reorganization, so take these docs with a grain of salt.
 
 \section design-considerations Design Considerations
diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h
index 8d5937427d..60d3aa6aa9 100644
--- a/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h
+++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h
@@ -18,9 +18,7 @@ std::unordered_set<Node>
                   std::unordered_map<Node, NodeRole> const &node_roles);
 
 /**
- * @brief See @ref
- * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md
- * "README.md" for explanation.
+ * \brief See \ref spization-escribano.
  */
 SeriesParallelDecomposition escribano_sp_ization(DiGraph g);
 
diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h
index a6f5a8d34a..93a4e29fa2 100644
--- a/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h
+++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h
@@ -13,9 +13,7 @@
 namespace FlexFlow {
 
 /**
- * @brief See @ref
- * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md
- * "README.md" for explanation.
+ * \brief See \ref spization-flexible.
  */
 SeriesParallelDecomposition
     flexible_sp_ization(DiGraphView const &g,
diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/README.md b/lib/utils/include/utils/graph/series_parallel/sp_ization/index.dox
similarity index 80%
rename from lib/utils/include/utils/graph/series_parallel/sp_ization/README.md
rename to lib/utils/include/utils/graph/series_parallel/sp_ization/index.dox
index 28a8ce7823..62d6ef542d 100644
--- a/lib/utils/include/utils/graph/series_parallel/sp_ization/README.md
+++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/index.dox
@@ -1,4 +1,7 @@
-# SP-ization
+namespace FlexFlow {
+/**
+
+\page spization utils/graph/series_parallel/sp_ization/
 
 As a refresher, a series-parallel decomposition (SPD) is an algebraic datatype that looks as follows:
 ```haskell
@@ -19,9 +22,11 @@ We have 2 main ways of achieving this:
 1. **Work (Node) Duplicating SP-ization**: preserves the critical path, but may duplicate nodes
 2. **Dependency (Edge) Addition SP-ization**: preserves the set of nodes, but may add edges
 
-## Node (Work) Duplicating SP-ization
+\section spization-work-duplicating Node (Work) Duplicating SP-ization
+
+\subsection spization-naive-work-duplicating Naive Work Duplicating
 
-### Naive ([work_duplicating_sp_ization.h](work_duplicating_sp_ization.h))
+Implemented in \ref work_duplicating_sp_ization.h (more specifically, \ref naive_work_duplicating_sp_ization).
 
 Transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph. The critical path cost is unchanged, and the SP-ization is done solely through node duplication.
 
@@ -62,7 +67,9 @@ digraph SP {
 
 We can roughly think of it as the parallel composition of all the possible paths from source to sink.
 
-### With Coalescing ([work_duplicating_sp_ization.h](work_duplicating_sp_ization.h))
+\subsection spization-with-coalescing With Coalescing
+
+Implemented in \ref work_duplicating_sp_ization.h (more specifically, \ref work_duplicating_sp_ization_with_coalescing).
 
 Transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph with coalescing. The critical path cost is unchanged, and the SP-ization is done solely through node (work) duplication.
 
@@ -94,11 +101,13 @@ digraph SP {
 }
 ```
 
-## Dependency Addition SP-ization
+\section spization-dependency-addition Dependency Addition SP-ization
 
-### Naive Stratum Sync ([naive_stratum_sync.h](naive_stratum_sync.h))
+\subsection spization-naive Naive Stratum Sync
 
-`naive_stratum_sync_sp_ization` transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph. The total number of nodes remains unchanged, and the SP-ization is done solely through edge (dependency) addition.
+Implemented in \ref naive_stratum_sync.h (more specifically, \ref naive_stratum_sync_sp_ization).
+
+\ref naive_stratum_sync_sp_ization transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph. The total number of nodes remains unchanged, and the SP-ization is done solely through edge (dependency) addition.
 
 The graph is first partitioned into strata: the i\_th stratum contains all the nodes whose critical path length has length i. The nodes in a given stratum are composed in parallel, and the strata are serially composed in succession.
 
@@ -127,9 +136,12 @@ digraph SP {
 }
 ```
 
-### Escribano Algorithm ([escribano_algo.h](escribano_algo.h))
+\subsection spization-escribano Escribano Algorithm
+
+Implemented in \ref escribano_algo.h (more specifically, \ref escribano_sp_ization).
+
+Paper can be found <a href="https://www.infor.uva.es/wp-content/uploads/2016/10/IT-DI-2002-0002.pdf">here</a>.
 
-Paper is present here: https://www.infor.uva.es/wp-content/uploads/2016/10/IT-DI-2002-0002.pdf.
 In the naive stratum sync algorithm, we add an all-to-all connection between all nodes in one stratum and the next. The escribano algorithm by contrast, leverages the fact that it might be possible to synchronize consecutive strata by adding smaller, more local connections that still yield a valid SP-ization graph.
 
 Example:
@@ -148,21 +160,23 @@ digraph G {
 }
 ```
 
-The strata are: {0}, {1, 2, 3}, {4, 5, 6}, {7}.
+The strata are: `{0}, {1, 2, 3}, {4, 5, 6}, {7}`.
 
 The naive stratum sync yields the following, adding an all-to-all connection between consecutive strata:
 ```
 S(0, P(1, 2, 3), P(4, 5, 6), 7)
 ```
 
-While the escribano algorithm is able to identify that strata 1 and 2 can be synced without adding an all-to-all connection: nodes {1, 2} only connect to {4, 5}, and node {3} only connects to {6}. It thus yields the following:
+While the escribano algorithm is able to identify that strata 1 and 2 can be synced without adding an all-to-all connection: nodes `{1, 2}` only connect to `{4, 5}`, and node `{3}` only connects to `{6}`. It thus yields the following:
 ```
 S(0, P(S(P(1, 2), P(4, 5)), S(3, 6)), 7)
 ```
 
 Our implementation, rather than building the SPD one stratum at a time, builds it one node at a time.
 
-### Flexible Algorithm ([flexible_algo.h](flexible_algo.h))
+\subsection spization-flexible Flexible Algorithm
+
+Implemented in \ref flexible_algo.h (more specifically, \ref flexible_sp_ization).
 
 Consider the following N-graph:
 
@@ -192,3 +206,6 @@ The flexible algorithm expands the escribano algorithm by generalizing it to suc
 In the escribano algorithm, once the sync area (the "forest") is identified, the partition into up and down sets is fixed: up is everything but the last layer, down is the last layer. But this is an arbitrary choice; there are multiple valid ways to partition the forest into an up set and a down set (across which we sync).
 
 The flexible algorithm exploits this by searching across all valid up/down partitions of the forest and selecting the one that minimizes the sum of critical path costs of the up and down subgraphs (i.e., the critical path cost of the resulting SP-ized subgraph after the sync).
+
+*/
+}
diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h
index 8cf38a1575..c782497155 100644
--- a/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h
+++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h
@@ -7,9 +7,7 @@
 namespace FlexFlow {
 
 /**
- * @brief See @ref
- *lib/utils/include/utils/graph/series_parallel/sp_ization/README.md "README.md"
- *for explanation.
+ * \brief See \ref spization-naive.
  **/
 SeriesParallelDecomposition naive_stratum_sync_sp_ization(DiGraphView const &g);
 
diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h
index 8973d44a6d..c6dd87d2a0 100644
--- a/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h
+++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h
@@ -8,17 +8,13 @@
 namespace FlexFlow {
 
 /**
- * @brief See @ref
- * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md
- * "README.md" for explanation.
+ * \brief See \ref spization-naive-work-duplicating.
  */
 SeriesParallelDecomposition
     naive_work_duplicating_sp_ization(DiGraphView const &g);
 
 /**
- * @brief See @ref
- * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md
- * "README.md" for explanation.
+ * \brief See \ref spization-with-coalescing.
  */
 SeriesParallelDecomposition
     work_duplicating_sp_ization_with_coalescing(DiGraphView const &g);

From 8994914356a1d51dfc94a93481cc390947919781 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Tue, 10 Mar 2026 15:32:06 -0700
Subject: [PATCH 096/113] Apply suggestions from code review

Co-authored-by: Elliott Slaughter <elliottslaughter@gmail.com>
---
 contributing.dox                                       | 10 +++++-----
 lib/local-execution/index.dox                          |  5 +++--
 .../include/realm-execution/pcg_instance.h             |  6 +++---
 .../include/realm-execution/realm_allocator.h          |  2 +-
 .../include/realm-execution/realm_context.h            |  4 ++--
 .../include/realm-execution/realm_manager.h            |  7 ++++++-
 .../include/realm-execution/tasks/impl/op_task.h       |  6 +++---
 .../tasks/impl/per_device_op_state_init_task.h         |  2 +-
 .../realm-execution/tasks/realm_task_registry.h        |  4 ++++
 .../include/realm-execution/tasks/task_id_t.dtg.toml   |  2 +-
 .../include/realm-execution/tasks/task_id_t.h          |  2 +-
 .../realm-execution/tensor_instance_backing.dtg.toml   |  2 +-
 lib/realm-execution/index.dox                          |  4 ++--
 lib/substitutions/index.dox                            |  2 +-
 .../include/task-spec/dynamic_graph/index.dox          |  2 +-
 15 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/contributing.dox b/contributing.dox
index 80373bf2ff..a420f950af 100644
--- a/contributing.dox
+++ b/contributing.dox
@@ -4,15 +4,15 @@
 
 \section contributing-setup Setup
 
-\note If you are developing on Stanford's sapling cluster, instead see the instructions \subpage sapling-setup "here". If you don't know what this means, you're not using sapling so you should just continue reading.
+\note If you are developing on Stanford's Sapling cluster, instead see the instructions \subpage sapling-setup "here". If you don't know what this means, you're not using Sapling so you should just continue reading.
 
-1. %FlexFlow %Train uses <a href="https://nix.dev/manual/nix/2.24/">nix</a> to manage dependencies and the development environment.
-   There exist a number of ways to install nix, but we recommend one of the following:
+1. %FlexFlow %Train uses <a href="https://nix.dev/manual/nix/2.24/">Nix</a> to manage dependencies and the development environment.
+   There exist a number of ways to install Nix, but we recommend one of the following:
 
    1. If you have root permissions: [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)
 
    2. If you don't have root permissions: [DavHau/nix-portable](https://github.com/DavHau/nix-portable).
-      Note that nix-portable does not work particularly well if the nix store is in NFS \ref contributing-footnote-1 "[1]" or other distributed file systems,
+      Note that nix-portable does not work particularly well if the Nix store is in NFS \ref contributing-footnote-1 "[1]" or other distributed file systems,
       so if you are running on an HPC cluster where the home directory is mounted via a distributed file system we recommend setting the
       <tt>NP_LOCATION</tt> environment to <tt>/tmp</tt> or some other non-NFS location.
 
@@ -118,7 +118,7 @@ with a detailed description of your platform and the commands you have run.
 
 \subsection contributing-nix-direnv nix-direnv (optional)
 
-If you installed nix system-wide (e.g., using [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)),
+If you installed Nix system-wide (e.g., using [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)),
 you can use [direnv](https://direnv.net/) to automatically enter the %FlexFlow %Train development environment when you `cd` into the repository, rather
 than having to manually run `nix develop`.
 [direnv](https://direnv.net) will also automatically exit the environment when you `cd` out of the repository, and (if configured using [nix-direnv](https://github.com/nix-community/nix-direnv)) will even automatically reload the environment if the `flake.nix` file changes.
diff --git a/lib/local-execution/index.dox b/lib/local-execution/index.dox
index 0c7b06820e..9c82327404 100644
--- a/lib/local-execution/index.dox
+++ b/lib/local-execution/index.dox
@@ -3,8 +3,9 @@ namespace FlexFlow {
 
 \page local-execution local-execution
 
-\brief Executes non-distributed \ref DynamicOpenDataflowGraph on local devices without using realm.
-       Used both for testing and (eventually by \ref realm-execution) for fusing operator task launches.
+\brief Executes non-distributed \ref DynamicOpenDataflowGraph on local devices without using Realm.
+       Used for testing and inside of Realm to execute specific operators.
+       Future uses may also include fusing operator task launches in Realm.
 
 The primary external-facing interface of local-execution.
 
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h
index 80a6b6e708..50e0c4b623 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance.h
@@ -24,10 +24,10 @@
 namespace FlexFlow {
 
 /**
- * \brief The main public interface for the realm backend.
+ * \brief The main public interface for the Realm backend.
  * Takes a MappedParallelComputationGraph and lowers it through
  * DynamicOpenDataflowGraph to get the fully-specified execution order of tasks
- * to be executed. Also tracks the allocation of realm instances for tensors
+ * to be issued. (Note: this is a parallel execution so execution order may not match the order in which operations are issued.) Also tracks the allocation of realm instances for tensors
  * through its TensorInstanceBacking.
  *
  * \note PCGInstance is primarily just a container for the various structs held
@@ -74,7 +74,7 @@ struct PCGInstance {
 };
 
 /**
- * \brief Creates a PCGInstance. Should generally be used instead of PCG&nstance::PCGInstance.
+ * \brief Creates a PCGInstance. Should generally be used instead of PCGInstance::PCGInstance.
  *
  * \relates PCGInstance
  */
diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
index b3bc277c73..77af4a742c 100644
--- a/lib/realm-execution/include/realm-execution/realm_allocator.h
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -10,7 +10,7 @@ namespace FlexFlow {
  * \brief An IAllocator instance that performs/manages each allocation as a
  * \ref realm-instance "Realm Instance".
  *
- * \note As with the other instances of IAllocator, You generally want to use
+ * \note As with the other instances of IAllocator, you generally want to use
  * \ref get_realm_allocator rather than explicitly calling the constructor of
  * RealmAllocator.
  */
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index a616851975..c2b1180be7 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -15,7 +15,7 @@
 namespace FlexFlow {
 
 /**
- * @brief An interface that wraps the rest of realm and protects against certain
+ * @brief An interface that wraps the rest of Realm and protects against certain
  * classes of bugs, such as shutdown bugs.
  *
  * @warn Do NOT call Realm directly unless you know what you are doing.
@@ -80,7 +80,7 @@ struct RealmContext {
   /**
    * \brief Compact **and clear** the outstanding event queue
    *
-   * \warning **User must block** on event or else use it, or it **will be lost**.
+   * \warning **User must block** on event or else use it, or it **will be lost** (potentially resulting in a shutdown hang).
    */
   [[nodiscard]] Realm::Event merge_outstanding_events();
 
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 89272c591a..fec10acd50 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -10,7 +10,7 @@
 namespace FlexFlow {
 
 /**
- * @brief Manages the initialization and shutdown of the realm runtime.
+ * @brief Manages the initialization and shutdown of the Realm runtime.
  * Provides the interface to launch the \ref term-controller that runs the rest of the computation
 * (i.e., \ref start_controller).
  */
@@ -23,6 +23,11 @@ struct RealmManager : private RealmContext {
   RealmManager(RealmManager const &) = delete;
   RealmManager(RealmManager &&) = delete;
 
+  /**
+   * @brief Launches the the \ref term-controller. Currently there is exactly one controller for the entire machine. The controller may be a function that closes over data (i.e., a lambda).
+   * 
+   * @warn If the provided function closes over data, **the user must block on the resulting event** to ensure it remains in scope until the controller completes.
+   */
   [[nodiscard]] Realm::Event
       start_controller(std::function<void(RealmContext &)>,
                        Realm::Event wait_on = Realm::Event::NO_EVENT);
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 29c49aa5fa..533df13380 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -23,14 +23,14 @@ namespace FlexFlow {
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task (\ref op_task_body, for a DynamicNodeInvocation
+ * \brief Launches the task (\ref op_task_body), for a DynamicNodeInvocation
  * using %Realm.
  *
  * The task launch process functions a bit differently to that used in the
  * previous FlexFlow codebase. Rather than having a function registered with
  * realm/legion for every task_id_t, we now have only a few functions
- * registered: \ref op_task_body, \ref device_handle_init_task_body,
- * \ref device_state_init_return_task_body, and \ref controller_task_body (see
+ * registered: \ref op_task_body, \ref ff_handle_init_task_body,
+ * \ref per_device_op_state_init_return_task_body, and \ref controller_task_body (see
  * \ref register_all_tasks for where this list comes from), and in fact only
  * \ref op_task_body is launched by \ref spawn_op_task. Each of these registered
  * tasks use the serialized arguments sent to them to dispatch to the correct
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
index 20cf1c3e5f..ac2f7d7c1d 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
@@ -26,7 +26,7 @@ void per_device_op_state_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task (\ref ff_handle_init_return_task_body) for starting
+ * \brief Launches the task (\ref per_device_op_state_init_task_body) for starting
  * the asynchronous initialization of the PerDeviceOpState.
  *
  * To understand how this fits into the broader structure of \ref
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
index 293821e2e6..6b0cc624b4 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -8,6 +8,8 @@ namespace FlexFlow {
 
 /**
  * \relates task_id_t
+ *
+ * \warning The event returned by this function <em>must be consumed</em> or else %Realm may not shut down properly.
  */
 [[nodiscard]] Realm::Event register_task(Realm::Processor::Kind target_kind,
                                          task_id_t func_id,
@@ -19,6 +21,8 @@ namespace FlexFlow {
 
 /**
  * \brief Registers all known tasks (using \ref register_task).
+ *
+ * \warning The event returned by this function <em>must be consumed</em> or else %Realm may not shut down properly.
  */
 [[nodiscard]] Realm::Event register_all_tasks();
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
index ea41f63d3f..b1e5e07e28 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
@@ -8,7 +8,7 @@ features = [
   "json",
 ]
 docstring = '''
-\brief An enum for identifying tasks for use in the realm runtime.
+\brief An enum for identifying tasks for use in the Realm runtime.
 
 \note Many of these are pulled over from the old FlexFlow codebase and are no
 longer in use. Eventually these should be pruned down to the set of tasks we're
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
index 4a14156c60..e8858d4451 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -29,7 +29,7 @@ std::optional<task_id_t>
     get_update_task_id_for_optimizer_attrs(OptimizerAttrs const &);
 
 /**
- * \brief Convert a FlexFlow::task_id_t into a %Realm task id.
+ * \brief Convert a FlexFlow::task_id_t into a %Realm task ID.
  *
  * \relates task_id_t
  */
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index 02a0e95ba1..1105af4a92 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -7,7 +7,7 @@ features = [
   #"hash",
 ]
 docstring = '''
-\brief A simple container for mapping between DynamicValueAttrs and the corresponding realm instances/events.
+\brief A simple container for mapping between DynamicValueAttrs and the corresponding Realm instances (along with the ready event for each instance).
 
 \note The actual logic for doing instance allocation and destruction are in \ref perform_instance_allocation
 and \ref destroy_instances, respectively.
diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox
index d43418f03f..b19f0ca905 100644
--- a/lib/realm-execution/index.dox
+++ b/lib/realm-execution/index.dox
@@ -3,7 +3,7 @@ namespace FlexFlow {
 
 \page realm-execution realm-execution
 
-\brief Executes distributed \ref MappedParallelComputationGraph ""s using realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec. Used both for testing and (eventually by \ref realm-execution) for fusing operator task launches.
+\brief Executes distributed \ref MappedParallelComputationGraph ""s using Realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec.
 
 The %Realm backend for distributed execution.
 
@@ -44,7 +44,7 @@ This is a single-controller implementation. That means the controller (the task
 
 \subsection term-controller controller
 
-The main thread/function that, in an non-controlled implementation, processes the task graph and dispatches all of the tasks.
+The main thread/function that, in a non-controlled-replicated implementation, processes the task graph and dispatches all of the tasks. In the future this will be extended to operate in a distributed fashion.
 
 */
 }
diff --git a/lib/substitutions/index.dox b/lib/substitutions/index.dox
index 120cc164d2..fbfa4af9dd 100644
--- a/lib/substitutions/index.dox
+++ b/lib/substitutions/index.dox
@@ -3,7 +3,7 @@ namespace FlexFlow {
 
 \page substitutions substitutions
 
-\brief Contains the definitions of pcg substitutions (i.e., Substitution), as well as the code for serializing them.
+\brief Contains the definitions of PCG substitutions (i.e., Substitution), as well as the code for serializing them.
 
 \section substitution Substitution
 
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/index.dox b/lib/task-spec/include/task-spec/dynamic_graph/index.dox
index e3259b5632..04ceaf4935 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/index.dox
+++ b/lib/task-spec/include/task-spec/dynamic_graph/index.dox
@@ -3,7 +3,7 @@ namespace FlexFlow {
 
 \page task-spec-dynamic-graph task-spec/dynamic_graph
 
-\brief Contains common code for inferring and making explicit information from a \ref MappedParallelComputationGraph, lowering it into a \ref DynamicOpenDataflowGraph that can be executed by \ref realm-execution and/or \ref local-execution.
+\brief Contains common code for inferring and making explicit information from a \ref MappedParallelComputationGraph or \ref ComputationGraph, lowering it into a \ref DynamicOpenDataflowGraph that can be executed by \ref realm-execution and/or \ref local-execution, respectively.
 
 \section task-spec-lowering-passes Lowering Passes
 

From d357b00c784c8678253b26ada1b2b327424abb61 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Tue, 10 Mar 2026 15:42:32 -0700
Subject: [PATCH 097/113] Incorporate remaining PR suggestions

---
 .../include/realm-execution/realm_manager.h            | 10 +++++++---
 .../include/realm-execution/tasks/impl/index.dox       |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index fec10acd50..65fcb83a3b 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -24,9 +24,13 @@ struct RealmManager : private RealmContext {
   RealmManager(RealmManager &&) = delete;
 
   /**
-   * @brief Launches the the \ref term-controller. Currently there is exactly one controller for the entire machine. The controller may be a function that closes over data (i.e., a lambda).
-   * 
-   * @warn If the provided function closes over data, **the user must block on the resulting event** to ensure it remains in scope until the controller completes.
+   * @brief Launches the \ref term-controller. Currently there is exactly
+   * one controller for the entire machine. The controller may be a function
+   * that closes over data (i.e., a lambda).
+   *
+   * @warn If the provided function closes over data, **the user must block on
+   * the resulting event** to ensure it remains in scope until the controller
+   * completes.
    */
   [[nodiscard]] Realm::Event
       start_controller(std::function<void(RealmContext &)>,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
index 5c87839040..b36c36ab1d 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
@@ -12,7 +12,7 @@ wrapper function for spawning a task (e.g., \ref collective_spawn_controller_tas
 
 \subsection tasks-controller-tasks Controller Tasks
 
-Runs the \ref term-controller. At most one of these per machine (i.e., this task is a singleton in the object-oriented sense of the word). Implemented in \ref controller_task.h.
+Runs the \ref term-controller. The current implementation uses exactly one controller (i.e., centralized control), but the intention is to expand this in the future to distributed control (one controller per node, or one per device). Implemented in \ref controller_task.h.
 
 \subsection tasks-op-task Operator Tasks
 

From 982792d68e80f968d27b6e73d89ebea1df7f5749 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Tue, 10 Mar 2026 17:54:10 -0700
Subject: [PATCH 098/113] Pass tests

---
 docs/doxygen/Doxyfile                         |  2 +-
 index.dox                                     | 22 +++++++--------
 ...h => per_device_op_state_initialization.h} |  6 ++--
 .../computation_graph_instance.cc             |  6 ++--
 ... => per_device_op_state_initialization.cc} |  4 +--
 ...evice_handle.h => distributed_ff_handle.h} | 12 ++++----
 ...buted_per_device_op_state_initialization.h |  6 ++--
 .../dynamic_tensor_accessor_from_instance.h   |  2 +-
 .../realm-execution/instance_allocation.h     | 10 +++----
 .../include/realm-execution/pcg_instance.h    | 28 ++++++++++---------
 .../tasks/impl/ff_handle_init_return_task.h   |  6 ++--
 .../tasks/impl/ff_handle_init_task.h          |  6 ++--
 .../realm-execution/tasks/impl/index.dox      | 10 +++----
 .../realm-execution/tasks/impl/op_task.h      | 16 +++++------
 .../per_device_op_state_init_return_task.h    |  6 ++--
 .../impl/per_device_op_state_init_task.h      |  6 ++--
 ...er_device_op_state_init_task_args.dtg.toml |  2 +-
 ...er_device_op_state_init_task_args.dtg.toml |  2 +-
 ...zable_per_device_op_state_init_task_args.h | 12 ++++----
 .../tasks/realm_task_registry.h               |  6 ++--
 .../include/realm-execution/tasks/task_id_t.h |  4 +--
 lib/realm-execution/index.dox                 | 14 +++++-----
 ...ice_handle.cc => distributed_ff_handle.cc} | 16 +++++------
 ...ted_per_device_op_state_initialization.cc} | 12 ++++----
 .../src/realm-execution/pcg_instance.cc       | 16 +++++------
 .../src/realm-execution/realm_context.cc      |  2 +-
 .../serializable_ff_handle_init_task_args.cc  | 16 +++++------
 ...able_per_device_op_state_init_task_args.cc | 14 +++++-----
 .../tasks/realm_task_registry.cc              | 20 ++++++-------
 ...ice_handle.cc => distributed_ff_handle.cc} | 10 +++----
 .../test/src/realm-execution/realm_manager.cc |  2 +-
 .../test/src/realm-execution/test_e2e.cc      | 17 ++++++-----
 .../include/task-spec/per_device_op_state.h   |  2 +-
 .../src/task-spec/per_device_op_state.cc      |  2 +-
 34 files changed, 162 insertions(+), 155 deletions(-)
 rename lib/local-execution/include/local-execution/{device_state_initialization.h => per_device_op_state_initialization.h} (81%)
 rename lib/local-execution/src/local-execution/{device_state_initialization.cc => per_device_op_state_initialization.cc} (95%)
 rename lib/realm-execution/include/realm-execution/{distributed_device_handle.h => distributed_ff_handle.h} (84%)
 rename lib/realm-execution/src/realm-execution/{distributed_device_handle.cc => distributed_ff_handle.cc} (78%)
 rename lib/realm-execution/src/realm-execution/{distributed_per_device_op_state_initalization.cc => distributed_per_device_op_state_initialization.cc} (88%)
 rename lib/realm-execution/test/src/realm-execution/{distributed_device_handle.cc => distributed_ff_handle.cc} (87%)

diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 8fce95e656..52a62fc2f9 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -380,7 +380,7 @@ MARKDOWN_ID_STYLE      = GITHUB
 # globally by setting AUTOLINK_SUPPORT to NO.
 # The default value is: YES.
 
-AUTOLINK_SUPPORT       = YES
+AUTOLINK_SUPPORT       = NO
 
 # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
 # to include (a tag file for) the STL sources as input, then you should set this
diff --git a/index.dox b/index.dox
index 5417431a69..9ae2794d59 100644
--- a/index.dox
+++ b/index.dox
@@ -1,26 +1,26 @@
 /**
 
-\mainpage %FlexFlow %Train
+\mainpage FlexFlow Train
 
-\brief %FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies.
+\brief FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies.
 
 \section root-layout Project Layout
 
-The bulk of the %FlexFlow source code is stored in the following folders:
+The bulk of the FlexFlow source code is stored in the following folders:
 
-- \subpage lib "": The C++ code that makes up %FlexFlow's core, split up into a number of libraries. You can find a description of each library [here](./lib/README.md).
-- \subpage bin "": Command-line interfaces for %FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in \ref lib for the actual processing/logic. You can find a description of each binary \ref bin "here".
-- `bindings`: Python (or any additional languages added in the future) bindings for %FlexFlow %Train. Still mostly unimplemented.
+- \subpage lib "": The C++ code that makes up FlexFlow's core, split up into a number of libraries. You can find a description of each library [here](./lib/README.md).
+- \subpage bin "": Command-line interfaces for FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in \ref lib for the actual processing/logic. You can find a description of each binary \ref bin "here".
+- `bindings`: Python (or any additional languages added in the future) bindings for FlexFlow Train. Still mostly unimplemented.
 - `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files in <a href="https://www.doxygen.nl/manual/index.html">Doxygen</a> syntax either in standalone `.dox` files or inline in header files.
-- `cmake`: CMake configuration for building %FlexFlow %Train. Note that unless you're modifying the build configuration (i.e., adding a library, additional dependencies, etc.), you generally should use \ref contributing-proj "proj" instead of interacting with CMake directly.
+- `cmake`: CMake configuration for building FlexFlow Train. Note that unless you're modifying the build configuration (i.e., adding a library, additional dependencies, etc.), you generally should use \ref contributing-proj "proj" instead of interacting with CMake directly.
 
 \section root-contributing Contributing
 
 Please let us know if you encounter any bugs or have any suggestions by <a href="https://github.com/flexflow/flexflow-train/issues">submitting an issue</a>.
 
-For instructions on how to contribute code to %FlexFlow Train, see \subpage contributing.
+For instructions on how to contribute code to FlexFlow Train, see \subpage contributing.
 
-We welcome all contributions to %FlexFlow %Train from bug fixes to new features and extensions.
+We welcome all contributions to FlexFlow Train from bug fixes to new features and extensions.
 
 \section root-citations Citations
 
@@ -32,9 +32,9 @@ We welcome all contributions to %FlexFlow %Train from bug fixes to new features
 
 \section root-team The Team
 
-%FlexFlow %Train is developed and maintained by teams at CMU, Facebook, Los Alamos National Lab, MIT, Stanford, and UCSD (alphabetically).
+FlexFlow Train is developed and maintained by teams at CMU, Facebook, Los Alamos National Lab, MIT, Stanford, and UCSD (alphabetically).
 
 \section root-license License
-%FlexFlow %Train uses Apache License 2.0.
+FlexFlow Train uses Apache License 2.0.
 
 */
diff --git a/lib/local-execution/include/local-execution/device_state_initialization.h b/lib/local-execution/include/local-execution/per_device_op_state_initialization.h
similarity index 81%
rename from lib/local-execution/include/local-execution/device_state_initialization.h
rename to lib/local-execution/include/local-execution/per_device_op_state_initialization.h
index 6abd58a32c..abf24cdfd1 100644
--- a/lib/local-execution/include/local-execution/device_state_initialization.h
+++ b/lib/local-execution/include/local-execution/per_device_op_state_initialization.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_DEVICE_STATE_INITIALIZATION_H
-#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_DEVICE_STATE_INITIALIZATION_H
+#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H
+#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H
 
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
@@ -25,7 +25,7 @@ DynamicNodeInvocation
 /**
  * @brief Initialize all operators and save the per-device op state
  */
-DynamicOpenDataflowGraph perform_device_state_initialization(
+DynamicOpenDataflowGraph perform_per_device_op_state_initialization(
     DynamicOpenDataflowGraph const &,
     Allocator &allocator,
     ProfilingSettings const &profiling_settings,
diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
index 40d9b187c4..797ce36e5d 100644
--- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
+++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
@@ -1,5 +1,5 @@
 #include "local-execution/computation_graph_instance/computation_graph_instance.h"
-#include "local-execution/device_state_initialization.h"
+#include "local-execution/per_device_op_state_initialization.h"
 #include "local-execution/task_execution.h"
 #include "local-execution/tensor_allocation.h"
 #include "pcg/optimizer_attrs.h"
@@ -96,7 +96,7 @@ ComputationGraphInstance create_computation_graph_instance(
         return get_loss_tensor_accessor(dg, lgv);
       });
 
-  dg = perform_device_state_initialization(dg,
+  dg = perform_per_device_op_state_initialization(dg,
                                            allocator,
                                            profiling_settings,
                                            device_handle,
@@ -134,7 +134,7 @@ static std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
             /*per_device_op_state=*/
             transform(invocation.node_attrs.per_device_op_state,
                       [&](DeviceSpecificPerDeviceOpState const &op_state) {
-                        return get_device_state_from_device_specific(
+                        return get_per_device_op_state_from_device_specific(
                             op_state, device_idx);
                       }),
             /*iteration_config=*/iteration_config,
diff --git a/lib/local-execution/src/local-execution/device_state_initialization.cc b/lib/local-execution/src/local-execution/per_device_op_state_initialization.cc
similarity index 95%
rename from lib/local-execution/src/local-execution/device_state_initialization.cc
rename to lib/local-execution/src/local-execution/per_device_op_state_initialization.cc
index b5462b4b78..2cd53b428b 100644
--- a/lib/local-execution/src/local-execution/device_state_initialization.cc
+++ b/lib/local-execution/src/local-execution/per_device_op_state_initialization.cc
@@ -1,4 +1,4 @@
-#include "local-execution/device_state_initialization.h"
+#include "local-execution/per_device_op_state_initialization.h"
 #include "local-execution/local_task_registry.h"
 #include "local-execution/task_execution.h"
 #include "op-attrs/computation_graph_op_attrs.dtg.h"
@@ -57,7 +57,7 @@ DynamicNodeInvocation
   return result;
 }
 
-DynamicOpenDataflowGraph perform_device_state_initialization(
+DynamicOpenDataflowGraph perform_per_device_op_state_initialization(
     DynamicOpenDataflowGraph const &dg,
     Allocator &allocator,
     ProfilingSettings const &profiling_settings,
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
similarity index 84%
rename from lib/realm-execution/include/realm-execution/distributed_device_handle.h
rename to lib/realm-execution/include/realm-execution/distributed_ff_handle.h
index 20f170d42a..e581694c86 100644
--- a/lib/realm-execution/include/realm-execution/distributed_device_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
@@ -12,10 +12,10 @@ namespace FlexFlow {
  * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each %GPU, both local
  * and remote. %GPUs here are represented by \ref Realm::Processor ""s.
  */
-struct DistributedDeviceHandle {
+struct DistributedFfHandle {
 public:
-  DistributedDeviceHandle() = delete;
-  explicit DistributedDeviceHandle(
+  DistributedFfHandle() = delete;
+  explicit DistributedFfHandle(
       std::unordered_map<Realm::Processor,
                          DeviceSpecificManagedPerDeviceFFHandle> const
           &handles);
@@ -31,11 +31,11 @@ struct DistributedDeviceHandle {
 /**
  * \brief Launches tasks (using \ref spawn_ff_handle_init_task) to create
  * the \ref device_handle_t ""s for each %GPU and packages the results into a
- * DistributedDeviceHandle.
+ * DistributedFfHandle.
  *
- * \relates DistributedDeviceHandle
+ * \relates DistributedFfHandle
  */
-DistributedDeviceHandle create_distributed_device_handle(
+DistributedFfHandle create_distributed_ff_handle(
     RealmContext &ctx,
     size_t workSpaceSize,
     bool allowTensorOpMathConversion,
diff --git a/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h
index 1518a9d04b..0da97089ce 100644
--- a/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h
+++ b/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h
@@ -3,7 +3,7 @@
 
 #include "kernels/profiling_settings.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/distributed_ff_handle.h"
 #include "realm-execution/per_device_op_state_backing.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
@@ -19,12 +19,12 @@ namespace FlexFlow {
  *
  * \relates PerDeviceOpStateBacking
  */
-PerDeviceOpStateBacking perform_distributed_device_state_initialization(
+PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
     RealmContext &ctx,
     DynamicOpenDataflowGraph const &dg,
     TensorInstanceBacking const &tensor_instance_backing,
     ProfilingSettings const &profiling_settings,
-    DistributedDeviceHandle const &device_handle,
+    DistributedFfHandle const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
     Realm::Event precondition);
diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
index 6891eca60d..bd304c5b4e 100644
--- a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
+++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
@@ -9,7 +9,7 @@
 namespace FlexFlow {
 
 /**
- * @brief Turn a %Realm region instance into a GenericTensorAccessor by
+ * @brief Turn a Realm region instance into a \ref GenericTensorAccessor by
  * re-wrapping the raw pointer.
  */
 DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index a9dfb5d9c3..39f9848a87 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -8,8 +8,8 @@
 namespace FlexFlow {
 
 /**
- * @brief Allocates a (potentially remote) %Realm instance for \param value
- * on the device represented by \param device_coord.
+ * @brief Allocates a (potentially remote) Realm instance for \p value
+ * on the device represented by \p device_coord.
  */
 std::pair<Realm::RegionInstance, Realm::Event>
     perform_instance_allocation_for_value(MachineSpaceCoordinate const &device_coord,
@@ -17,8 +17,8 @@ std::pair<Realm::RegionInstance, Realm::Event>
                                           RealmContext &ctx);
 
 /**
- * @brief Allocates the (potentially remote) %Realm instances for all of the
- * values in \param g, excluding the preallocated values in \param preallocated,
+ * @brief Allocates the (potentially remote) Realm instances for all of the
+ * values in \p g, excluding the preallocated values in \p preallocated,
  * using \ref perform_instance_allocation_for_value.
  *
  * \relates TensorInstanceBacking
@@ -30,7 +30,7 @@ TensorInstanceBacking perform_instance_allocation(
     RealmContext &ctx);
 
 /**
- * @brief Destroys all of the instances held in \param instances.
+ * @brief Destroys all of the instances held in \p instances.
  */
 void destroy_instances(TensorInstanceBacking const &instances,
                        Realm::Event precondition);
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h
index 50e0c4b623..e468bcfb97 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance.h
@@ -10,7 +10,7 @@
 #include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
-#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/distributed_ff_handle.h"
 #include "realm-execution/per_device_op_state_backing.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
@@ -25,12 +25,13 @@ namespace FlexFlow {
 
 /**
  * \brief The main public interface for the Realm backend.
- * Takes a MappedParallelComputationGraph and lowers it through
- * DynamicOpenDataflowGraph to get the fully-specified execution order of tasks
- * to be issued. (Note: this is a parallel execution so execution order may not match the order in which operations are issued.) Also tracks the allocation of realm instances for tensors
- * through its TensorInstanceBacking.
+ * Takes a \ref MappedParallelComputationGraph and lowers it through
+ * \ref DynamicOpenDataflowGraph to get the fully-specified execution order of tasks
+ * to be issued. (Note: this is a parallel execution so execution order may not
+ * match the order in which operations are issued.) Also tracks the allocation
+ * of realm instances for tensors through its \ref TensorInstanceBacking.
  *
- * \note PCGInstance is primarily just a container for the various structs held
+ * \note \ref PCGInstance is primarily just a container for the various structs held
  * inside it. The actual initialization and training iteration functionality is
  * held in \ref create_pcg_instance and \ref
  * perform_update_pass_for_pcg_instance, respectively.
@@ -74,7 +75,8 @@ struct PCGInstance {
 };
 
 /**
- * \brief Creates a PCGInstance. Should generally be used instead of PCGInstance::PCGInstance.
+ * \brief Creates a \ref PCGInstance. Should generally be used instead of \ref
+ * PCGInstance::PCGInstance.
  *
  * \relates PCGInstance
  */
@@ -89,11 +91,11 @@ PCGInstance create_pcg_instance(
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
-    DistributedDeviceHandle const &device_handle,
+    DistributedFfHandle const &ff_handle,
     FFIterationConfig const &iteration_config);
 
 /**
- * \brief Dispatch a training iteration for a PCGInstance.
+ * \brief Dispatch a training iteration for a \ref PCGInstance.
  *
  * To dispatch just a piece of a training iteration, see the following functions:
  * - \ref perform_forward_pass_for_pcg_instance
@@ -106,28 +108,28 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &ff_handle,
         FFIterationConfig iteration_config);
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_forward_pass_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &ff_handle,
         FFIterationConfig iteration_config);
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_backward_pass_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &ff_handle,
         FFIterationConfig iteration_config);
 
 std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_update_pass_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &ff_handle,
         FFIterationConfig iteration_config);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index f7de2b1293..ae7f5c8691 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -8,8 +8,8 @@
 namespace FlexFlow {
 
 /**
- * \brief The function registered as a %Realm task for returning the
- * asynchronously-initialized FFHandle. Dispatched by \ref
+ * \brief The function registered as a Realm task for returning the
+ * asynchronously-initialized \ref FFHandle. Dispatched by \ref
  * spawn_ff_handle_init_return_task.
  *
  * To understand how this fits into the broader structure of \ref
@@ -20,7 +20,7 @@ void ff_handle_init_return_task_body(
 
 /**
  * \brief Launches the task (\ref ff_handle_init_return_task_body) for returning
- * the asynchronously-initialized FFHandle.
+ * the asynchronously-initialized \ref FFHandle.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
index 8588816576..ff87b1fa4d 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
@@ -8,8 +8,8 @@
 namespace FlexFlow {
 
 /**
- * \brief The function registered as a %Realm task for starting the asynchronous
- * initialization of the FFHandle. Dispatched by \ref
+ * \brief The function registered as a Realm task for starting the asynchronous
+ * initialization of the \ref FFHandle. Dispatched by \ref
  * spawn_ff_handle_init_task.
  *
  * To understand how this fits into the broader structure of \ref
@@ -20,7 +20,7 @@ void ff_handle_init_task_body(
 
 /**
  * \brief Launches the task (\ref ff_handle_init_return_task_body) for starting
- * the asynchronous initialization of the FFHandle.
+ * the asynchronous initialization of the \ref FFHandle.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
index b36c36ab1d..e527314346 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
@@ -7,8 +7,8 @@ namespace {
 
 \section tasks-one-part Individual Tasks
 
-Invidividual tasks are just normal %Realm tasks, which are implemented in \ref realm-execution as a
-wrapper function for spawning a task (e.g., \ref collective_spawn_controller_task) and a task body which is the actual %Realm task implementation (e.g., \ref controller_task_body). Each also has an optional corresponding <em>TaskArgument</em> (e.g., OpTaskArgs) object to provide a structure to the arguments passed from the wrapper to the task body. In cases where the %TaskArgument object is not trivially JSON-serializable, a corresponding JSON-serializable task argument type is provided (e.g., SerializeableOpTaskArgs).
+Individual tasks are just normal Realm tasks, which are implemented in \ref realm-execution as a
+wrapper function for spawning a task (e.g., \ref collective_spawn_controller_task) and a task body which is the actual Realm task implementation (e.g., \ref controller_task_body). Each also has an optional corresponding <em>TaskArgument</em> (e.g., \ref OpTaskArgs) object to provide a structure to the arguments passed from the wrapper to the task body. In cases where the %TaskArgument object is not trivially JSON-serializable, a corresponding JSON-serializable task argument type is provided (e.g., \ref SerializableOpTaskArgs).
 
 \subsection tasks-controller-tasks Controller Tasks
 
@@ -22,15 +22,15 @@ Implements all of the operator tasks, i.e., the tasks that are executed during t
 
 The other two types of tasks are implemented as pairs of tasks: one to begin initializing a value (e.g., \ref spawn_ff_handle_init_task), and another to return the initialized value when it's ready (e.g., \ref spawn_ff_handle_init_return_task). As with \ref task-one-part, they have an optional corresponding tasks argument type and a potential serializable task argument type.
 
-\todo \@Elliott why is the paired tasks structure required? Is it a performance optimization, or simply necessary given the set of primitives %Realm provides?
+\todo \@Elliott why is the paired tasks structure required? Is it a performance optimization, or simply necessary given the set of primitives Realm provides?
 
 \subsection tasks-ffhandle-init FFHandle Initialization Tasks
 
-For initializing the FFHandle for each %GPU. Implemented in \ref ff_handle_init_task.h and \ref ff_handle_init_return_task.h.
+For initializing the \ref FFHandle for each GPU. Implemented in \ref ff_handle_init_task.h and \ref ff_handle_init_return_task.h.
 
 \subsection tasks-op-state-init PerDeviceOpState Initialization Tasks
 
-For initializing the PerDeviceOpState for each shard of an operator task. Implemented in \ref per_device_op_state_init_task.h and \ref per_device_op_state_init_return_task.h.
+For initializing the \ref PerDeviceOpState for each shard of an operator task. Implemented in \ref per_device_op_state_init_task.h and \ref per_device_op_state_init_return_task.h.
 
 */
 }
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 533df13380..6a0ac53053 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -17,18 +17,18 @@
 namespace FlexFlow {
 
 /**
- * \brief The function registered as a %Realm task for operator-related tasks.
+ * \brief The function registered as a Realm task for operator-related tasks.
  * Dispatched by \ref spawn_op_task.
  */
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task (\ref op_task_body), for a DynamicNodeInvocation
- * using %Realm.
+ * \brief Launches the task (\ref op_task_body), for a \ref DynamicNodeInvocation
+ * using Realm.
  *
  * The task launch process functions a bit differently to that used in the
  * previous FlexFlow codebase. Rather than having a function registered with
- * realm/legion for every task_id_t, we now have only a few functions
+ * realm/legion for every \ref task_id_t, we now have only a few functions
  * registered: \ref op_task_body, \ref ff_handle_init_task_body,
  * \ref per_device_op_state_init_return_task_body, and \ref controller_task_body (see
  * \ref register_all_tasks for where this list comes from), and in fact only
@@ -36,17 +36,17 @@ void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
  * tasks use the serialized arguments sent to them to dispatch to the correct
  * implementatin in task-spec: for example, if we are trying to launch the task
  * for a Conv2d operator, this function will actually dispatch a call to \ref
- * op_task_body with a serialized OpTaskArgs as an argument, and then \ref
+ * op_task_body with a serialized \ref OpTaskArgs as an argument, and then \ref
  * op_task_body will deserialize the argument, determine that we are trying to
  * launch the forward pass of Conv2d, use \ref execute_dynamic_node_invocation
  * (which then uses \ref call_fwd_task_impl) to actually call the function in
  * lib/task-spec/src/task-spec/ops/impl/conv_2d.cc
  *
  * The above also means that we don't have a separate
- * ITaskArgumentAccessor subclass for realm-execution. Instead we ship over the
+ * \ref ITaskArgumentAccessor subclass for realm-execution. Instead we ship over the
  * information on the corresponding realm instances over to the remote node,
- * grab the corresponding pointer/GenericTensorAccessor, and then use
- * LocalTaskArgumentAccessor for the actual argument access as, by this point,
+ * grab the corresponding pointer/\ref GenericTensorAccessor, and then use
+ * \ref LocalTaskArgumentAccessor for the actual argument access as, by this point,
  * everything is local.
  *
  * To understand how this fits into the broader structure of \ref
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
index 5d8b0d4beb..bbe640a376 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
@@ -9,8 +9,8 @@
 namespace FlexFlow {
 
 /**
- * \brief The function registered as a %Realm task for returning the
- * asynchronously-initialized PerDeviceOpState. Dispatched by \ref
+ * \brief The function registered as a Realm task for returning the
+ * asynchronously-initialized \ref PerDeviceOpState. Dispatched by \ref
  * spawn_per_device_op_state_init_return_task.
  *
  * To understand how this fits into the broader structure of \ref
@@ -21,7 +21,7 @@ void per_device_op_state_init_return_task_body(
 
 /**
  * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for returning
- * the asynchronously-initialized PerDeviceOpState.
+ * the asynchronously-initialized \ref PerDeviceOpState.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
index ac2f7d7c1d..1c4675da2a 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
@@ -15,8 +15,8 @@
 namespace FlexFlow {
 
 /**
- * \brief The function registered as a %Realm task for starting the asynchronous
- * initialization of the PerDeviceOpState. Dispatched by \ref
+ * \brief The function registered as a Realm task for starting the asynchronous
+ * initialization of the \ref PerDeviceOpState. Dispatched by \ref
  * spawn_per_device_op_state_init_task.
  *
  * To understand how this fits into the broader structure of \ref
@@ -27,7 +27,7 @@ void per_device_op_state_init_task_body(
 
 /**
  * \brief Launches the task (\ref per_device_op_state_init_task_body) for starting
- * the asynchronous initialization of the PerDeviceOpState.
+ * the asynchronous initialization of the \ref PerDeviceOpState.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
index 9a7c2781d2..57012ce716 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "DeviceStateInitTaskArgs"
+name = "PerDeviceOpStateInitTaskArgs"
 type = "struct"
 features = []
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml
index 034132f9d1..0e53767862 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "SerializableDeviceStateInitTaskArgs"
+name = "SerializablePerDeviceOpStateInitTaskArgs"
 type = "struct"
 features = [
   "eq",
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
index f028820974..33bf1abd96 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
@@ -1,15 +1,15 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H
 
-#include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
-#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.h"
 
 namespace FlexFlow {
 
-SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
-    DeviceStateInitTaskArgs const &);
-DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
-    SerializableDeviceStateInitTaskArgs const &);
+SerializablePerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_to_serializable(
+    PerDeviceOpStateInitTaskArgs const &);
+PerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_from_serializable(
+    SerializablePerDeviceOpStateInitTaskArgs const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
index 6b0cc624b4..8ff08abe49 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -9,7 +9,8 @@ namespace FlexFlow {
 /**
  * \relates task_id_t
  *
- * \warning The event returned by this function <em>must be consumed</em> or else %Realm may not shut down properly.
+ * \warning The event returned by this function <em>must be consumed</em> or
+ * else Realm may not shut down properly.
  */
 [[nodiscard]] Realm::Event register_task(Realm::Processor::Kind target_kind,
                                          task_id_t func_id,
@@ -22,7 +23,8 @@ namespace FlexFlow {
 /**
  * \brief Registers all known tasks (using \ref register_task).
  *
- * \warning The event returned by this function <em>must be consumed</em> or else %Realm may not shut down properly.
+ * \warning The event returned by this function <em>must be consumed</em> or
+ * else Realm may not shut down properly.
  */
 [[nodiscard]] Realm::Event register_all_tasks();
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
index e8858d4451..b307492d6b 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -11,7 +11,7 @@
 namespace FlexFlow {
 
 /**
- * \brief Retrieves the \ref task_id_t for a DynamicNodeAttrs, with
+ * \brief Retrieves the \ref task_id_t for a \ref DynamicNodeAttrs, with
  * a return value of \ref std::nullopt to be treated as a no-op task.
  */
 std::optional<task_id_t>
@@ -29,7 +29,7 @@ std::optional<task_id_t>
     get_update_task_id_for_optimizer_attrs(OptimizerAttrs const &);
 
 /**
- * \brief Convert a FlexFlow::task_id_t into a %Realm task ID.
+ * \brief Convert a \ref FlexFlow::task_id_t into a Realm task ID.
  *
  * \relates task_id_t
  */
diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox
index b19f0ca905..ea1545987f 100644
--- a/lib/realm-execution/index.dox
+++ b/lib/realm-execution/index.dox
@@ -5,7 +5,7 @@ namespace FlexFlow {
 
 \brief Executes distributed \ref MappedParallelComputationGraph ""s using Realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec.
 
-The %Realm backend for distributed execution.
+The Realm backend for distributed execution.
 
 This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution.
 
@@ -20,14 +20,14 @@ This is a single-controller implementation. That means the controller (the task
 - \ref RealmContext "": \copybrief RealmContext
 - \subpage realm-execution-tasks "include/realm-execution/tasks": The Realm task implementations and their supporting infrastructure.
   - \ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
-  - \ref "lib/realm-execution/include/realm-execution/tasks/serializer/" "serializer/": additional support for serializing %Realm data types.
-  - \ref realm_task_registry.h: Manages the registration of %Realm tasks. All %Realm tasks go through this interface.
-  - \ref task_id_t.h and \ref realm_task_id_t.h: Types to represent %Realm tasks, along with an encoding to %Realm's native task ID type.
-- Helper components (mainly used within PCGInstance)
+  - \ref "lib/realm-execution/include/realm-execution/tasks/serializer/" "serializer/": additional support for serializing Realm data types.
+  - \ref realm_task_registry.h: Manages the registration of Realm tasks. All Realm tasks go through this interface.
+  - \ref task_id_t.h and \ref realm_task_id_t.h: Types to represent Realm tasks, along with an encoding to Realm's native task ID type.
+- Helper components (mainly used within \ref PCGInstance)
   - \ref DistributedDeviceHandle "": represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
   - \ref DependencySet "": tracks dependencies during execution of tasks.
-  - \ref "distributed_device_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting PerDeviceOpStateBacking.
-  - \ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting TensorInstanceBacking.
+  - \ref "distributed_per_device_op_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting \ref PerDeviceOpStateBacking.
+  - \ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting \ref TensorInstanceBacking.
 
 \section realm-execution-todo Outstanding TODOs
 
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
similarity index 78%
rename from lib/realm-execution/src/realm-execution/distributed_device_handle.cc
rename to lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
index 87376be9b1..185d7e0c89 100644
--- a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
@@ -1,22 +1,22 @@
-#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/distributed_ff_handle.h"
 #include "realm-execution/device_specific_managed_per_device_ff_handle.h"
-#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/tasks/impl/ff_handle_init_task.h"
 #include "task-spec/device_specific.h"
 
 namespace FlexFlow {
 
-DistributedDeviceHandle::DistributedDeviceHandle(
+DistributedFfHandle::DistributedFfHandle(
     std::unordered_map<Realm::Processor,
                        DeviceSpecificManagedPerDeviceFFHandle> const &handles)
     : handles(handles) {}
 
 DeviceSpecificManagedPerDeviceFFHandle const &
-    DistributedDeviceHandle::at(Realm::Processor processor) const {
+    DistributedFfHandle::at(Realm::Processor processor) const {
   return this->handles.at(processor);
 }
 
-DistributedDeviceHandle
-    create_distributed_device_handle(RealmContext &ctx,
+DistributedFfHandle
+    create_distributed_ff_handle(RealmContext &ctx,
                                      size_t workSpaceSize,
                                      bool allowTensorOpMathConversion,
                                      Realm::Event precondition) {
@@ -35,7 +35,7 @@ DistributedDeviceHandle
   }
 
   for (auto &[proc, handle] : handles) {
-    spawn_device_handle_init_task(ctx,
+    spawn_ff_handle_init_task(ctx,
                                   proc,
                                   workSpaceSize,
                                   allowTensorOpMathConversion,
@@ -45,7 +45,7 @@ DistributedDeviceHandle
 
   ctx.get_outstanding_events().wait();
 
-  return DistributedDeviceHandle{handles};
+  return DistributedFfHandle{handles};
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initalization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
similarity index 88%
rename from lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initalization.cc
rename to lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
index 8fdc9a9784..8612fa4b97 100644
--- a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initalization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
@@ -1,6 +1,6 @@
-#include "realm-execution/distributed_per_device_op_state_initalization.h"
-#include "local-execution/device_state_initialization.h"
-#include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "realm-execution/distributed_per_device_op_state_initialization.h"
+#include "local-execution/per_device_op_state_initialization.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_task.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
 #include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
@@ -16,12 +16,12 @@
 
 namespace FlexFlow {
 
-PerDeviceOpStateBacking perform_distributed_device_state_initialization(
+PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
     RealmContext &ctx,
     DynamicOpenDataflowGraph const &dg,
     TensorInstanceBacking const &tensor_instance_backing,
     ProfilingSettings const &profiling_settings,
-    DistributedDeviceHandle const &device_handle,
+    DistributedFfHandle const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
     Realm::Event precondition) {
@@ -45,7 +45,7 @@ PerDeviceOpStateBacking perform_distributed_device_state_initialization(
                                                 std::nullopt};
 
     std::optional<Realm::Event> completion_event =
-        spawn_device_state_init_task(ctx,
+        spawn_per_device_op_state_init_task(ctx,
                                      target_proc,
                                      invocation,
                                      tensor_backing,
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc
index ead569f4ce..60d96eca49 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc
@@ -2,7 +2,7 @@
 #include "op-attrs/tensor_slot_name.dtg.h"
 #include "pcg/optimizer_attrs.h"
 #include "realm-execution/dependency_set.h"
-#include "realm-execution/distributed_device_state_initialization.h"
+#include "realm-execution/distributed_per_device_op_state_initialization.h"
 #include "realm-execution/instance_allocation.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tasks/impl/op_task.h"
@@ -86,7 +86,7 @@ PCGInstance create_pcg_instance(
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
-    DistributedDeviceHandle const &device_handle,
+    DistributedFfHandle const &device_handle,
     FFIterationConfig const &iteration_config) {
 
   DynamicOpenDataflowGraph dg =
@@ -136,7 +136,7 @@ PCGInstance create_pcg_instance(
       });
 
   PerDeviceOpStateBacking device_state_backing =
-      perform_distributed_device_state_initialization(
+      perform_distributed_per_device_op_state_initialization(
           ctx,
           dg,
           tensor_instance_backing,
@@ -169,7 +169,7 @@ static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
         PerDeviceOpStateBacking const &device_state_backing,
         OptimizerAttrs const &optimizer_attrs,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &device_handle,
         FFIterationConfig iteration_config) {
   // For simplicity we'll track a dependency on all outstanding operations up to
   // this point. This will create an effective barrier between phases.
@@ -229,7 +229,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_all_passes_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       pcg_instance.get_execution_order();
@@ -252,7 +252,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_forward_pass_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       filter(pcg_instance.get_execution_order(),
@@ -277,7 +277,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_backward_pass_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       filter(pcg_instance.get_execution_order(),
@@ -302,7 +302,7 @@ std::unordered_map<dynamic_layer_guid_t, Realm::Event>
     perform_update_pass_for_pcg_instance(
         PCGInstance &pcg_instance,
         ProfilingSettings const &profiling_settings,
-        DistributedDeviceHandle const &device_handle,
+        DistributedFfHandle const &device_handle,
         FFIterationConfig iteration_config) {
   std::vector<DynamicNodeInvocation> execution_order =
       filter(pcg_instance.get_execution_order(),
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 10ed07118b..4e981e7414 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -6,8 +6,8 @@
 #include "pcg/device_id_t.h"
 #include "pcg/device_type.dtg.h"
 #include "realm-execution/realm_allocator.h"
-#include "realm-execution/tasks/realm_task_id_t.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
+#include "realm-execution/tasks/task_id_t.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/transform.h"
 #include "utils/nonnegative_int/nonnegative_int.h"
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
index a44a5a5db1..5f0e8e5a7f 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
@@ -1,11 +1,11 @@
-#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.h"
+#include "realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h"
 
 namespace FlexFlow {
 
-SerializableDeviceHandleInitTaskArgs
-    device_handle_init_task_args_to_serializable(
-        DeviceHandleInitTaskArgs const &args) {
-  return SerializableDeviceHandleInitTaskArgs{
+SerializableFfHandleInitTaskArgs
+    ff_handle_init_task_args_to_serializable(
+        FfHandleInitTaskArgs const &args) {
+  return SerializableFfHandleInitTaskArgs{
       /*workSpaceSize=*/args.workSpaceSize,
       /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
       /*origin_proc=*/realm_processor_to_serializable(args.origin_proc),
@@ -13,9 +13,9 @@ SerializableDeviceHandleInitTaskArgs
   };
 }
 
-DeviceHandleInitTaskArgs device_handle_init_task_args_from_serializable(
-    SerializableDeviceHandleInitTaskArgs const &args) {
-  return DeviceHandleInitTaskArgs{
+FfHandleInitTaskArgs ff_handle_init_task_args_from_serializable(
+    SerializableFfHandleInitTaskArgs const &args) {
+  return FfHandleInitTaskArgs{
       /*workSpaceSize=*/args.workSpaceSize,
       /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
       /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
index 2e7e02b529..fc30837325 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h"
 #include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
 #include "realm-execution/tasks/serializer/serializable_realm_processor.h"
 #include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
@@ -6,9 +6,9 @@
 
 namespace FlexFlow {
 
-SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
-    DeviceStateInitTaskArgs const &args) {
-  return SerializableDeviceStateInitTaskArgs{
+SerializablePerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_to_serializable(
+    PerDeviceOpStateInitTaskArgs const &args) {
+  return SerializablePerDeviceOpStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
       /*tensor_backing*/
       tensor_instance_backing_to_serializable(args.tensor_backing),
@@ -21,9 +21,9 @@ SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
   };
 }
 
-DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
-    SerializableDeviceStateInitTaskArgs const &args) {
-  return DeviceStateInitTaskArgs{
+PerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_from_serializable(
+    SerializablePerDeviceOpStateInitTaskArgs const &args) {
+  return PerDeviceOpStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
       /*tensor_backing*/
       tensor_instance_backing_from_serializable(args.tensor_backing),
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 0372dcdfeb..69d5a163c8 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -1,9 +1,9 @@
 #include "realm-execution/tasks/realm_task_registry.h"
 #include "realm-execution/tasks/impl/controller_task.h"
-#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
-#include "realm-execution/tasks/impl/device_handle_init_task.h"
-#include "realm-execution/tasks/impl/device_state_init_return_task.h"
-#include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "realm-execution/tasks/impl/ff_handle_init_return_task.h"
+#include "realm-execution/tasks/impl/ff_handle_init_task.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h"
+#include "realm-execution/tasks/impl/per_device_op_state_init_task.h"
 #include "realm-execution/tasks/impl/op_task.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "utils/exception.h"
@@ -55,9 +55,9 @@ Realm::Event register_all_tasks() {
 
   for (task_id_t task_id : init_task_ids) {
     pending_registrations.push_back(register_task(
-        Realm::Processor::LOC_PROC, task_id, device_state_init_task_body));
+        Realm::Processor::LOC_PROC, task_id, per_device_op_state_init_task_body));
     pending_registrations.push_back(register_task(
-        Realm::Processor::TOC_PROC, task_id, device_state_init_task_body));
+        Realm::Processor::TOC_PROC, task_id, per_device_op_state_init_task_body));
   }
 
   std::vector<task_id_t> task_ids = {
@@ -140,19 +140,19 @@ Realm::Event register_all_tasks() {
   pending_registrations.push_back(
       register_task(Realm::Processor::LOC_PROC,
                     task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
-                    device_handle_init_task_body));
+                    ff_handle_init_task_body));
   pending_registrations.push_back(
       register_task(Realm::Processor::TOC_PROC,
                     task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
-                    device_handle_init_task_body));
+                    ff_handle_init_task_body));
   pending_registrations.push_back(
       register_task(Realm::Processor::LOC_PROC,
                     task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID,
-                    device_handle_init_return_task_body));
+                    ff_handle_init_return_task_body));
   pending_registrations.push_back(
       register_task(Realm::Processor::LOC_PROC,
                     task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID,
-                    device_state_init_return_task_body));
+                    per_device_op_state_init_return_task_body));
   return Realm::Event::merge_events(pending_registrations);
 }
 
diff --git a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/test/src/realm-execution/distributed_ff_handle.cc
similarity index 87%
rename from lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
rename to lib/realm-execution/test/src/realm-execution/distributed_ff_handle.cc
index aaefe337db..8ce5d3ed6e 100644
--- a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
+++ b/lib/realm-execution/test/src/realm-execution/distributed_ff_handle.cc
@@ -1,4 +1,4 @@
-#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/distributed_ff_handle.h"
 #include "internal/realm_test_utils.h"
 #include "realm-execution/realm_manager.h"
 #include <doctest/doctest.h>
@@ -9,7 +9,7 @@ using namespace ::FlexFlow;
 namespace Realm = ::FlexFlow::Realm;
 
 TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("DistributedDeviceHandle") {
+  TEST_CASE("DistributedFfHandle") {
     std::vector<char *> fake_args =
         make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n);
     int fake_argc = fake_args.size();
@@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     RealmManager manager(&fake_argc, &fake_argv);
 
     (void)manager.start_controller([](RealmContext &ctx) {
-      DistributedDeviceHandle handle = create_distributed_device_handle(
+      DistributedFfHandle handle = create_distributed_ff_handle(
           /*ctx=*/ctx,
           /*workSpaceSize=*/1024 * 1024,
           /*allowTensorOpMathConversion=*/true);
@@ -35,7 +35,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 }
 
 TEST_SUITE(FF_CUDA_TEST_SUITE) {
-  TEST_CASE("DistributedDeviceHandle (GPU)") {
+  TEST_CASE("DistributedFfHandle (GPU)") {
     std::vector<char *> fake_args =
         make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/1_n);
     int fake_argc = fake_args.size();
@@ -44,7 +44,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     RealmManager manager(&fake_argc, &fake_argv);
 
     (void)manager.start_controller([](RealmContext &ctx) {
-      DistributedDeviceHandle handle = create_distributed_device_handle(
+      DistributedFfHandle handle = create_distributed_ff_handle(
           /*ctx=*/ctx,
           /*workSpaceSize=*/1024 * 1024,
           /*allowTensorOpMathConversion=*/true);
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 450d7fd3ec..4063ec32f2 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -1,6 +1,6 @@
 #include "realm-execution/realm_manager.h"
 #include "internal/realm_test_utils.h"
-#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/distributed_ff_handle.h"
 #include <doctest/doctest.h>
 
 namespace test {
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 11d2f14f1c..d9b50b5ea7 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -13,9 +13,9 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
-#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/distributed_ff_handle.h"
 #include "realm-execution/dynamic_tensor_accessor_from_instance.h"
-#include "realm-execution/pcg_instance/pcg_instance.h"
+#include "realm-execution/pcg_instance.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/realm_manager.h"
 #include "task-spec/permissions.h"
@@ -205,7 +205,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
           input_tensors;
 
-      DistributedDeviceHandle device_handle = create_distributed_device_handle(
+      DistributedFfHandle device_handle = create_distributed_ff_handle(
           ctx,
           /*workSpaceSize=*/1024 * 1024,
           /*allowTensorOpMathConversion=*/true);
@@ -278,8 +278,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     TensorShape label_tensor_shape = TensorShape{
         TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
-    GenericTensorAccessorW label_tensor =
-        allocator.allocate_tensor(label_tensor_shape);
 
     TensorShape weight_shape_1 = TensorShape{
         TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
@@ -422,16 +420,19 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RealmManager manager(&fake_argc, &fake_argv);
 
-    (void)manager.start_controller([](RealmContext &ctx) {
+    Realm::Event e = manager.start_controller([&](RealmContext &ctx) {
       Allocator allocator = ctx.get_current_device_allocator();
 
       GenericTensorAccessorW label_tensor_backing =
           allocator.allocate_tensor(output_tensor_shape);
 
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
       std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
           input_tensors;
 
-      DistributedDeviceHandle device_handle = create_distributed_device_handle(
+      DistributedFfHandle device_handle = create_distributed_ff_handle(
           ctx,
           /*workSpaceSize=*/1024 * 1024,
           /*allowTensorOpMathConversion=*/true);
@@ -483,6 +484,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           check_kv("last_epoch_loss",
                    format_accessor_r_contents(last_epoch_loss)));
     });
+
+    e.wait();
 //! [realm-execution example]
   }
 }
diff --git a/lib/task-spec/include/task-spec/per_device_op_state.h b/lib/task-spec/include/task-spec/per_device_op_state.h
index 68d3f98ebf..8783f902e4 100644
--- a/lib/task-spec/include/task-spec/per_device_op_state.h
+++ b/lib/task-spec/include/task-spec/per_device_op_state.h
@@ -8,7 +8,7 @@
 
 namespace FlexFlow {
 
-PerDeviceOpState get_device_state_from_device_specific(
+PerDeviceOpState get_per_device_op_state_from_device_specific(
     DeviceSpecificPerDeviceOpState const &, device_id_t device_idx);
 
 }
diff --git a/lib/task-spec/src/task-spec/per_device_op_state.cc b/lib/task-spec/src/task-spec/per_device_op_state.cc
index 12b649e663..438cd8886c 100644
--- a/lib/task-spec/src/task-spec/per_device_op_state.cc
+++ b/lib/task-spec/src/task-spec/per_device_op_state.cc
@@ -3,7 +3,7 @@
 
 namespace FlexFlow {
 
-PerDeviceOpState get_device_state_from_device_specific(
+PerDeviceOpState get_per_device_op_state_from_device_specific(
     DeviceSpecificPerDeviceOpState const &device_specific,
     device_id_t device_idx) {
   return device_specific.visit<PerDeviceOpState>(

From 45609bc7294f40d20548c6508a0145690a0f1cd2 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Wed, 11 Mar 2026 03:35:51 -0700
Subject: [PATCH 099/113] Resolve all outstanding doxygen errors, add doxygen
 check to CI

---
 .clang-format-for-format-sh                   |  2 +-
 .proj.toml                                    |  1 +
 .../src/export-model-arch/main.cc             |  4 +-
 contributing.dox                              | 25 +++----
 docs/doxygen/Doxyfile                         |  5 +-
 flake.lock                                    |  8 +-
 index.dox                                     |  4 +-
 ..._substitution_and_update_machine_mapping.h |  2 +-
 .../compiler/task_graph_simulator/index.dox   | 10 +++
 lib/index.dox                                 |  2 +-
 lib/kernels/src/kernels/accessor.cc           |  2 +
 .../computation_graph_instance.cc             | 12 +--
 .../mapped_per_device_op_states_group.cc      |  2 +-
 lib/models/include/models/bert/index.dox      | 10 +++
 .../include/models/candle_uno/candle_uno.h    |  3 +-
 .../include/models/candle_uno/index.dox       | 10 +++
 lib/models/include/models/dlrm/index.dox      | 10 +++
 .../include/models/inception_v3/index.dox     | 10 +++
 .../include/models/split_test/index.dox       |  5 +-
 .../include/models/transformer/index.dox      | 10 +++
 .../include/models/transformer/transformer.h  |  3 +-
 lib/models/index.dox                          | 10 +--
 .../src/models/split_test/split_test.cc       |  4 +-
 .../initializers/uniform_initializer_attrs.h  |  2 +
 lib/op-attrs/include/op-attrs/ops/index.dox   | 11 +--
 lib/op-attrs/index.dox                        |  9 ++-
 .../initializers/uniform_initializer_attrs.cc |  2 +
 .../include/pcg/computation_graph_builder.h   |  5 +-
 .../v1/v1_binary_sp_decomposition/json.cc     |  2 +
 .../realm-execution/distributed_ff_handle.h   |  4 +-
 .../realm-execution/instance_allocation.h     |  7 +-
 .../include/realm-execution/pcg_instance.h    | 18 +++--
 .../include/realm-execution/realm_context.h   |  5 +-
 .../include/realm-execution/realm_manager.h   |  8 +-
 .../tasks/impl/controller_task.h              |  9 ++-
 .../tasks/impl/ff_handle_init_return_task.h   |  4 +-
 .../tasks/impl/ff_handle_init_task.h          |  4 +-
 .../realm-execution/tasks/impl/index.dox      |  8 +-
 .../realm-execution/tasks/impl/op_task.h      | 34 ++++-----
 .../per_device_op_state_init_return_task.h    |  4 +-
 .../impl/per_device_op_state_init_task.h      |  4 +-
 .../serializable_ff_handle_init_task_args.h   |  3 +-
 ...zable_per_device_op_state_init_task_args.h | 10 ++-
 .../tasks/realm_task_registry.h               |  2 +-
 .../include/realm-execution/tasks/task_id_t.h |  7 +-
 lib/realm-execution/index.dox                 | 10 +--
 .../realm-execution/distributed_ff_handle.cc  | 16 ++--
 ...uted_per_device_op_state_initialization.cc | 18 ++---
 .../realm-execution/instance_allocation.cc    | 11 +--
 .../tasks/impl/ff_handle_init_return_task.cc  | 11 ++-
 .../tasks/impl/ff_handle_init_task.cc         | 37 +++++-----
 .../src/realm-execution/tasks/impl/op_task.cc | 14 ++--
 .../per_device_op_state_init_return_task.cc   |  8 +-
 .../impl/per_device_op_state_init_task.cc     | 20 ++---
 .../serializable_ff_handle_init_task_args.cc  |  3 +-
 ...able_per_device_op_state_init_task_args.cc | 10 ++-
 .../tasks/realm_task_registry.cc              | 14 ++--
 .../test/src/realm-execution/test_e2e.cc      | 21 +++---
 lib/substitutions/index.dox                   |  4 +-
 .../task-spec/fwd_bwd_op_task_impl_function.h |  2 +
 .../task-spec/generic_task_impl_function.h    |  2 +
 .../task_argument_accessor/index.dox          |  2 +-
 .../fwd_bwd_op_task_impl_function.cc          |  3 +
 .../task-spec/generic_task_impl_function.cc   |  3 +
 .../task-spec/init_op_task_impl_function.cc   |  2 +
 .../utils/any_value_type/any_value_type.h     |  4 +
 lib/utils/include/utils/graph/index.dox       | 34 ++++-----
 .../utils/any_value_type/any_value_type.cc    |  2 +
 .../open_dataflow_graph/algorithms/as_dot.cc  |  3 +-
 .../graph/series_parallel/graph_generation.cc |  3 +-
 lib/utils/src/utils/graph/traversal.cc        | 73 +++++++++----------
 lib/utils/src/utils/half.cc                   |  2 +
 72 files changed, 370 insertions(+), 278 deletions(-)
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/index.dox
 create mode 100644 lib/models/include/models/bert/index.dox
 create mode 100644 lib/models/include/models/candle_uno/index.dox
 create mode 100644 lib/models/include/models/dlrm/index.dox
 create mode 100644 lib/models/include/models/inception_v3/index.dox
 create mode 100644 lib/models/include/models/transformer/index.dox

diff --git a/.clang-format-for-format-sh b/.clang-format-for-format-sh
index 17e9f8935d..0da534af8b 100644
--- a/.clang-format-for-format-sh
+++ b/.clang-format-for-format-sh
@@ -133,7 +133,7 @@ PointerAlignment: Right
 PPIndentWidth:   -1
 QualifierAlignment: Right
 ReferenceAlignment: Pointer
-ReflowComments:  true
+ReflowComments:  false
 ShortNamespaceLines: 1
 SortIncludes:    CaseSensitive
 SortJavaStaticImport: Before
diff --git a/.proj.toml b/.proj.toml
index 5dbbfbcdd7..3d78c9ae82 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -2,6 +2,7 @@ project_name = "flexflow"
 testsuite_macro = "FF_TEST_SUITE"
 namespace_name = "FlexFlow"
 header_extension = ".h"
+doxygen = true
 cuda_launch_cmd = [
   "nixGL",
   "--",
diff --git a/bin/export-model-arch/src/export-model-arch/main.cc b/bin/export-model-arch/src/export-model-arch/main.cc
index 0c2cfbdb6b..e62809dda5 100644
--- a/bin/export-model-arch/src/export-model-arch/main.cc
+++ b/bin/export-model-arch/src/export-model-arch/main.cc
@@ -118,7 +118,7 @@ tl::expected<JsonSPModelExport, std::string>
 }
 
 int main(int argc, char **argv) {
-//! [utils/cli example]
+  //! [utils/cli example]
   CLISpec cli = empty_cli_spec();
 
   CLIArgumentKey arg_key_help = cli_add_help_flag(cli);
@@ -183,7 +183,7 @@ int main(int argc, char **argv) {
   bool sp_decompositition = cli_get_flag(parsed, key_sp_decomposition);
   bool dot = cli_get_flag(parsed, key_dot);
   bool preprocessed_dot = cli_get_flag(parsed, key_preprocessed_dot);
-//! [utils/cli example]
+  //! [utils/cli example]
 
   auto handle_error = [](auto const &result) {
     if (!result.has_value()) {
diff --git a/contributing.dox b/contributing.dox
index a420f950af..b86be494c5 100644
--- a/contributing.dox
+++ b/contributing.dox
@@ -1,3 +1,4 @@
+namespace FlexFlow {
 /**
 
 \page contributing Developers Guide
@@ -12,7 +13,7 @@
    1. If you have root permissions: [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)
 
    2. If you don't have root permissions: [DavHau/nix-portable](https://github.com/DavHau/nix-portable).
-      Note that nix-portable does not work particularly well if the Nix store is in NFS \ref contributing-footnote-1 "[1]" or other distributed file systems,
+      Note that nix-portable does not work particularly well if the Nix store is in <a href="https://en.wikipedia.org/wiki/Network_File_System">NFS</a> or other distributed file systems,
       so if you are running on an HPC cluster where the home directory is mounted via a distributed file system we recommend setting the
       <tt>NP_LOCATION</tt> environment to <tt>/tmp</tt> or some other non-NFS location.
 
@@ -37,7 +38,7 @@
       nix (Nix) 2.20.6
       \endverbatim
 
-2. Clone the %FlexFlow %Train repository (or, if you'd prefer, follow the alternative setup instructions in the [ff-dev](#ff-dev-optional) section)
+2. Clone the FlexFlow Train repository
 
 \verbatim
 $ FF_DIR="$HOME/flexflow-train" # or wherever else you want to put the repository
@@ -45,7 +46,7 @@ $ git clone --recursive git@github.com:flexflow/flexflow-train.git "$FF_DIR"
 ...
 \endverbatim
 
-3. Enter the nix-provided `default` development environment \ref contributing-footnote-2 "[2]"
+3. Enter the nix-provided `default` development environment (aka "dev shell")
 
 \verbatim
 $ cd "$FF_DIR"
@@ -82,11 +83,11 @@ Total Test time (real) =   8.64 sec
 \endverbatim
 
 If you don't, or if you see any tests failing, please double check that you have followed the instructions above.
-If you have and are still encountering an issue, please [contact us](#contact-us) with a detailed description of your platform and the commands you have run.
+If you have and are still encountering an issue, please \ref contributing-contact-us "contact us" with a detailed description of your platform and the commands you have run.
 
 \subsection contributing-editorconfig EditorConfig
 
-%FlexFlow %Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors.
+FlexFlow Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors.
 The EditorConfig file for %FlexFlow %Train can be found in [`.editorconfig`](./.editorconfig).
 If you are using vim, emacs, or another editor with built-in EditorConfig support (a full list of editors with built-in EditorConfig support can be found [here](https://editorconfig.org/#pre-installed))
 the configuration will be detected and applied without you needing to do anything.
@@ -113,8 +114,7 @@ You should see the additional GPU tests run. If you instead see a message like
 > `Error: ... Pass --skip-gpu-tests to skip running tests that require a GPU`
 
 Double check that you are correctly in the `gpu` devshell, not the `default` devshell.
-If you've confirmed that you are in the correct devshell and are still encountering issues, [contact us](#contact-us)
-with a detailed description of your platform and the commands you have run.
+If you've confirmed that you are in the correct devshell and are still encountering issues, \ref contributing-contact-us "contact us" with a detailed description of your platform and the commands you have run.
 
 \subsection contributing-nix-direnv nix-direnv (optional)
 
@@ -177,14 +177,14 @@ To help you get started, however, a list of common command invocations is includ
 
 We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows:
 
-1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`).
+1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see \ref contributing-proj for more information on `proj`).
 2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.
 
 GPU machines for CI are managed using [runs-on](https://runs-on.com/).
 
 \section contributing-contributing Contributing to FlexFlow
 
-We actively welcome your pull requests. Note that we may already be working on the feature/fix you're looking for, so we suggest searching through the [open issues](https://github.com/flexflow/flexflow-train/issues), [open PRs](https://github.com/flexflow/flexflow-train/pulls), and [contacting us](#contact-us) to make sure you're not duplicating existing effort!
+We actively welcome your pull requests. Note that we may already be working on the feature/fix you're looking for, so we suggest searching through the [open issues](https://github.com/flexflow/flexflow-train/issues), [open PRs](https://github.com/flexflow/flexflow-train/pulls), and \ref contributing-contact-us "contacting us" to make sure you're not duplicating existing effort!
 
 The steps for getting changes merged into %FlexFlow are relatively standard:
 
@@ -208,9 +208,6 @@ For any reported bugs, please ensure that your description clear and has suffici
 
 By contributing to %FlexFlow %Train, you agree that your contributions will be licensed
 under the [LICENSE](./LICENSE) file in the root directory of this source tree.
-*/
-
-<hr/>
 
-1. \anchor contributing-footnote-1 <a href="https://en.wikipedia.org/wiki/Network_File_System">Network File System</a>
-2. \anchor contributing-footnote-2 aka "dev shell"
+*/
+}
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 52a62fc2f9..933fc234f5 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -1012,7 +1012,8 @@ RECURSIVE              = YES
 # run.
 
 EXCLUDE                = lib/realm-execution/include/realm-execution/realm.h \
-                         lib/runtime/
+                         lib/runtime/ \
+                         lib/local-pcg-execution/
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -1028,7 +1029,7 @@ EXCLUDE_SYMLINKS       = NO
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*
 
-EXCLUDE_PATTERNS       = */tl/*
+EXCLUDE_PATTERNS       = */tl/* */test/* */hip/*
 
 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
 # (namespaces, classes, functions, etc.) that should be excluded from the
diff --git a/flake.lock b/flake.lock
index 9cd1e4bbae..359fdb19a9 100644
--- a/flake.lock
+++ b/flake.lock
@@ -66,11 +66,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1769666654,
-        "narHash": "sha256-YFbOVi+Se3KDGFAoofYwYPUpEkEhsvdGdlYDR2I2XmI=",
+        "lastModified": 1773224815,
+        "narHash": "sha256-A7JWZNzYwzMZigyqm8IzyiBu82iFznp+oZJzx0eZjmU=",
         "ref": "refs/heads/master",
-        "rev": "64620d82f03478496eb00188184dbf48d56b560d",
-        "revCount": 143,
+        "rev": "d1db2bac548f66912d22023a3cece241ded1f503",
+        "revCount": 145,
         "type": "git",
         "url": "https://git.sr.ht/~lockshaw/proj"
       },
diff --git a/index.dox b/index.dox
index 9ae2794d59..cae3197cdf 100644
--- a/index.dox
+++ b/index.dox
@@ -1,3 +1,4 @@
+namespace FlexFlow {
 /**
 
 \mainpage FlexFlow Train
@@ -8,7 +9,7 @@
 
 The bulk of the FlexFlow source code is stored in the following folders:
 
-- \subpage lib "": The C++ code that makes up FlexFlow's core, split up into a number of libraries. You can find a description of each library [here](./lib/README.md).
+- \subpage lib "": The C++ code that makes up FlexFlow's core, split up into a number of libraries.
 - \subpage bin "": Command-line interfaces for FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in \ref lib for the actual processing/logic. You can find a description of each binary \ref bin "here".
 - `bindings`: Python (or any additional languages added in the future) bindings for FlexFlow Train. Still mostly unimplemented.
 - `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files in <a href="https://www.doxygen.nl/manual/index.html">Doxygen</a> syntax either in standalone `.dox` files or inline in header files.
@@ -38,3 +39,4 @@ FlexFlow Train is developed and maintained by teams at CMU, Facebook, Los Alamos
 FlexFlow Train uses Apache License 2.0.
 
 */
+}
diff --git a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h
index b08ca57851..aebca09ab8 100644
--- a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h
@@ -24,7 +24,7 @@ namespace FlexFlow {
  */
 SearchResult apply_substitution_and_update_machine_mapping(
     SearchResult const &mapped_pcg,
-    Substitution const &sub,
+    Substitution const &substitution,
     PCGPatternMatch const &match);
 
 } // namespace FlexFlow
diff --git a/lib/compiler/include/compiler/task_graph_simulator/index.dox b/lib/compiler/include/compiler/task_graph_simulator/index.dox
new file mode 100644
index 0000000000..c0a481b3a1
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/index.dox
@@ -0,0 +1,10 @@
+namespace FlexFlow {
+/**
+
+\page task-graph-simulator compiler/task_graph_simulator
+
+\todo
+  \@lockshaw Add docs and example(s).
+
+*/
+}
diff --git a/lib/index.dox b/lib/index.dox
index 8f7f8d5586..69c52ae378 100644
--- a/lib/index.dox
+++ b/lib/index.dox
@@ -44,7 +44,7 @@ digraph example {
     taskspec       [label="task-spec", URL="\ref task-spec", color="red", fontcolor="red"];
     localexecution [label="local-execution", URL="\ref local-execution", color="red", fontcolor="red"];
     realmexecution [label="realm-execution", URL="\ref realm-execution", color="red", fontcolor="red"];
-    realm          [label="realm", URL="\ref realm", color="red", fontcolor="red", style="dashed"];
+    realm          [label="Realm", color="red", fontcolor="red", style="dashed"];
 
     utils -> opattrs;
     opattrs -> pcg;
diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc
index bfa2169b0d..a3f8ead17f 100644
--- a/lib/kernels/src/kernels/accessor.cc
+++ b/lib/kernels/src/kernels/accessor.cc
@@ -299,6 +299,7 @@ namespace std {
 
 using namespace ::FlexFlow;
 
+///\cond
 size_t hash<GenericTensorAccessorR>::operator()(
     GenericTensorAccessorR const &a) const {
   return get_std_hash(a.tie());
@@ -308,5 +309,6 @@ size_t hash<GenericTensorAccessorW>::operator()(
     GenericTensorAccessorW const &a) const {
   return get_std_hash(a.tie());
 }
+///\endcond
 
 } // namespace std
diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
index 797ce36e5d..8c3a30a82d 100644
--- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
+++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
@@ -97,12 +97,12 @@ ComputationGraphInstance create_computation_graph_instance(
       });
 
   dg = perform_per_device_op_state_initialization(dg,
-                                           allocator,
-                                           profiling_settings,
-                                           device_handle,
-                                           iteration_config,
-                                           optimizer_attrs,
-                                           device_idx);
+                                                  allocator,
+                                                  profiling_settings,
+                                                  device_handle,
+                                                  iteration_config,
+                                                  optimizer_attrs,
+                                                  device_idx);
 
   // Compute the topological ordering of the graph
   auto [kwarg_graph, node_map] =
diff --git a/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc b/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc
index b94f7378ac..363e918190 100644
--- a/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc
+++ b/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc
@@ -76,7 +76,7 @@ std::tuple<
 }
 
 bidict<MachineSpaceCoordinate, OperatorAtomicTaskShardBinding> const &
-    MappedPerDeviceOpStatesGroup::get_shard_bindings() const {
+    MappedPerDeviceOpStatesGroup::get_per_device_op_states() const {
   return this->shard_bindings;
 }
 
diff --git a/lib/models/include/models/bert/index.dox b/lib/models/include/models/bert/index.dox
new file mode 100644
index 0000000000..f923a93480
--- /dev/null
+++ b/lib/models/include/models/bert/index.dox
@@ -0,0 +1,10 @@
+namespace FlexFlow {
+/**
+
+\page models-bert models/bert
+
+\todo
+  \@lockshaw Add docs and example(s).
+
+*/
+}
diff --git a/lib/models/include/models/candle_uno/candle_uno.h b/lib/models/include/models/candle_uno/candle_uno.h
index bee398b71d..efc99653be 100644
--- a/lib/models/include/models/candle_uno/candle_uno.h
+++ b/lib/models/include/models/candle_uno/candle_uno.h
@@ -34,7 +34,8 @@ CandleUnoConfig get_default_candle_uno_config();
  * @param config The config of the Candle Uno model.
  * @return The PCG of a Transformer model.
  */
-ComputationGraph get_candle_uno_computation_graph(CandleUnoConfig const &config);
+ComputationGraph
+    get_candle_uno_computation_graph(CandleUnoConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/models/include/models/candle_uno/index.dox b/lib/models/include/models/candle_uno/index.dox
new file mode 100644
index 0000000000..7845dca599
--- /dev/null
+++ b/lib/models/include/models/candle_uno/index.dox
@@ -0,0 +1,10 @@
+namespace FlexFlow {
+/**
+
+\page models-candle-uno models/candle_uno
+
+\todo
+  \@lockshaw Add docs and example(s).
+
+*/
+}
diff --git a/lib/models/include/models/dlrm/index.dox b/lib/models/include/models/dlrm/index.dox
new file mode 100644
index 0000000000..1c952bc8f5
--- /dev/null
+++ b/lib/models/include/models/dlrm/index.dox
@@ -0,0 +1,10 @@
+namespace FlexFlow {
+/**
+
+\page models-dlrm models/dlrm
+
+\todo
+  \@lockshaw Add docs and example(s).
+
+*/
+}
diff --git a/lib/models/include/models/inception_v3/index.dox b/lib/models/include/models/inception_v3/index.dox
new file mode 100644
index 0000000000..006e7d6334
--- /dev/null
+++ b/lib/models/include/models/inception_v3/index.dox
@@ -0,0 +1,10 @@
+namespace FlexFlow {
+/**
+
+\page models-inception models/inception_v3
+
+\todo
+  \@lockshaw Add docs and example(s).
+
+*/
+}
diff --git a/lib/models/include/models/split_test/index.dox b/lib/models/include/models/split_test/index.dox
index c5aa84706a..501eb1111d 100644
--- a/lib/models/include/models/split_test/index.dox
+++ b/lib/models/include/models/split_test/index.dox
@@ -1,7 +1,10 @@
 namespace FlexFlow {
 /**
 
-@page split-test models/split_test/
+@page split-test models/split_test
+
+\todo
+  \@lockshaw Add docs and example(s).
 
 */
 }
diff --git a/lib/models/include/models/transformer/index.dox b/lib/models/include/models/transformer/index.dox
new file mode 100644
index 0000000000..551735929c
--- /dev/null
+++ b/lib/models/include/models/transformer/index.dox
@@ -0,0 +1,10 @@
+namespace FlexFlow {
+/**
+
+\page models-transformer models/transformer
+
+\todo
+  \@lockshaw Add docs and example(s).
+
+*/
+}
diff --git a/lib/models/include/models/transformer/transformer.h b/lib/models/include/models/transformer/transformer.h
index da6bfc3a9d..20636bc524 100644
--- a/lib/models/include/models/transformer/transformer.h
+++ b/lib/models/include/models/transformer/transformer.h
@@ -40,7 +40,8 @@ TransformerConfig get_default_transformer_config();
  * @param config The config of Transformer model.
  * @return The PCG of a Transformer model.
  */
-ComputationGraph get_transformer_computation_graph(TransformerConfig const &config);
+ComputationGraph
+    get_transformer_computation_graph(TransformerConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/models/index.dox b/lib/models/index.dox
index 9b9b308976..8046ef125f 100644
--- a/lib/models/index.dox
+++ b/lib/models/index.dox
@@ -6,11 +6,11 @@ namespace FlexFlow {
 \brief Pre-built \ref ComputationGraph ""s for various models for use in testing and evalutation.
 
 \section real-models Real Models
-- \subpage bert "BERT"
-- \subpage candle-uno "Candle UNO"
-- \subpage dlrm "DLRM"
-- \subpage inception-v3 "Inception v3"
-- \subpage transformer "Transformer"
+- \subpage models-bert "BERT"
+- \subpage models-candle-uno "Candle UNO"
+- \subpage models-dlrm "DLRM"
+- \subpage models-inception "Inception v3"
+- \subpage models-transformer "Transformer"
 
 \section test-models Artificial Models for Testing
 - \subpage split-test
diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc
index a091ba3ce7..13cc42b356 100644
--- a/lib/models/src/models/split_test/split_test.cc
+++ b/lib/models/src/models/split_test/split_test.cc
@@ -5,7 +5,7 @@
 namespace FlexFlow {
 
 ComputationGraph get_split_test_computation_graph(positive_int batch_size) {
-//! [ComputationGraphBuilder example]
+  //! [ComputationGraphBuilder example]
   ComputationGraphBuilder cgb;
 
   positive_int layer_dim1 = 256_p;
@@ -35,7 +35,7 @@ ComputationGraph get_split_test_computation_graph(positive_int batch_size) {
   t = cgb.softmax(t);
 
   return cgb.computation_graph;
-//! [ComputationGraphBuilder example]
+  //! [ComputationGraphBuilder example]
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h b/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h
index 67873c32b1..674f18b919 100644
--- a/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h
+++ b/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h
@@ -4,6 +4,7 @@
 #include "op-attrs/initializers/uniform_initializer_attrs.dtg.h"
 #include <rapidcheck.h>
 
+///\cond
 namespace rc {
 
 template <>
@@ -12,5 +13,6 @@ struct Arbitrary<::FlexFlow::UniformInitializerAttrs> {
 };
 
 } // namespace rc
+///\endcond
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/index.dox b/lib/op-attrs/include/op-attrs/ops/index.dox
index 669e9aa027..6e5465ca68 100644
--- a/lib/op-attrs/include/op-attrs/ops/index.dox
+++ b/lib/op-attrs/include/op-attrs/ops/index.dox
@@ -30,19 +30,16 @@ More specifically, this consists of the following pieces:
   \snippet lib/op-attrs/src/op-attrs/ops/linear.cc parallel shape inference composition example
 
 - A function for inferring the slot names for the incoming tensors (
-  \ref "std::unordered_map<TensorSlotName, IncomingTensorRole> get_linear_incoming_tensor_roles(LinearAttrs const &)")
+  \ref "get_linear_incoming_tensor_roles(LinearAttrs const &)")
 - Functions for computing the dependencies between shards of the parallelized input, weight, and output tensors, e.g.,
-  - \ref "OperatorSpaceToParallelTensorSpaceMapping get_operator_to_input_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
-  - \ref "OperatorSpaceToParallelTensorSpaceMapping get_operator_to_projection_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
-  - \ref "OperatorSpaceToParallelTensorSpaceMapping get_operator_to_output_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
+  - \ref "get_operator_to_input_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
+  - \ref "get_operator_to_projection_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
+  - \ref "get_operator_to_output_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)"
 
 Note that as different operators have different numbers of inputs, etc. the number and signatures of these functions may be different for different operators. While keeping the structure of the various operators similar is makes it easier to understand, it's not strictly necessary: the code that calls these functions for a generic operator allows custom behavior for each operator, which allows us to have a bit more freedom to evolve operator definitions over time:
 - \ref get_operator_to_ptensor_mappings (and associated functions in \ref get_operator_space_to_parallel_tensor_space_mappings.h)
 - \ref "get_incoming_tensor_roles(ComputationGraphOpAttrs const &)" (and associated functions in \ref get_incoming_tensor_roles.h)
 - \ref "get_output_shapes(ComputationGraphOpAttrs const &, std::unordered_map<TensorSlotName, TensorShape> const &input_shapes)" (and associated functions in \ref op-attrs/shape_inference.h)
 
-
-
-
 */
 }
diff --git a/lib/op-attrs/index.dox b/lib/op-attrs/index.dox
index 51bfd31db3..86cce3594b 100644
--- a/lib/op-attrs/index.dox
+++ b/lib/op-attrs/index.dox
@@ -1,3 +1,4 @@
+namespace FlexFlow {
 /**
 
 \page op-attrs op-attrs
@@ -7,12 +8,12 @@
 Key pieces include:
 
 - Representing tensors in the compiler:
-  \ref FlexFlow::TensorShape, \ref FlexFlow::TensorDims
+  \ref TensorShape, \ref TensorDims
 - Representing parallel/sharded/distributed tensors in the compiler:
-  \ref FlexFlow::ParallelTensorShape, \ref FlexFlow::ParallelTensorDimDegrees
+  \ref ParallelTensorShape, \ref ParallelTensorDimDegrees
 - The actual operator definitions: \subpage op-attrs-ops "ops/"
 - Computing data dependencies of operators computing over parallel tensors:
-  \ref get_operator_to_parallel_tensor_space_mappings.h
-
+  \ref get_operator_space_to_parallel_tensor_space_mappings.h
 
 */
+}
diff --git a/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc b/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc
index 2c7065c9cc..8c87f7ce27 100644
--- a/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc
+++ b/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/initializers/uniform_initializer_attrs.h"
 
+///\cond
 namespace rc {
 
 using ::FlexFlow::UniformInitializerAttrs;
@@ -19,3 +20,4 @@ Gen<UniformInitializerAttrs> Arbitrary<UniformInitializerAttrs>::arbitrary() {
 };
 
 } // namespace rc
+///\endcond
diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h
index ddc9ee312a..4e4cacc731 100644
--- a/lib/pcg/include/pcg/computation_graph_builder.h
+++ b/lib/pcg/include/pcg/computation_graph_builder.h
@@ -7,9 +7,10 @@
 namespace FlexFlow {
 
 /**
- * \brief A helper interface for building ComputationGraph in a pytorch (i.e., weight-implicit) style.
+ * \brief A helper interface for building ComputationGraph in a pytorch (i.e.,
+ * weight-implicit) style.
  *
- * For an example of how to use it, see the following code from \ref models:
+ * For an example of how to use it, see the following code from \ref models "":
  * \snippet lib/models/src/models/split_test/split_test.cc ComputationGraphBuilder example
  */
 struct ComputationGraphBuilder {
diff --git a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
index d39652a7e2..8136d0e71c 100644
--- a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
+++ b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
@@ -5,6 +5,7 @@
 
 using namespace ::FlexFlow;
 
+///\cond
 namespace nlohmann {
 
 V1BinarySPDecomposition
@@ -82,3 +83,4 @@ void adl_serializer<V1BinaryParallelSplit>::to_json(
 }
 
 } // namespace nlohmann
+///\endcond
diff --git a/lib/realm-execution/include/realm-execution/distributed_ff_handle.h b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
index e581694c86..2a500ff150 100644
--- a/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
@@ -9,8 +9,8 @@
 namespace FlexFlow {
 
 /**
- * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each %GPU, both local
- * and remote. %GPUs here are represented by \ref Realm::Processor ""s.
+ * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each %GPU, both
+ * local and remote. A GPU here is represented by a Realm::Processor.
  */
 struct DistributedFfHandle {
 public:
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
index 39f9848a87..66cc07af75 100644
--- a/lib/realm-execution/include/realm-execution/instance_allocation.h
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -12,9 +12,10 @@ namespace FlexFlow {
  * on the device represented by \p device_coord.
  */
 std::pair<Realm::RegionInstance, Realm::Event>
-    perform_instance_allocation_for_value(MachineSpaceCoordinate const &device_coord,
-                                          DynamicValueAttrs const &value,
-                                          RealmContext &ctx);
+    perform_instance_allocation_for_value(
+        MachineSpaceCoordinate const &device_coord,
+        DynamicValueAttrs const &value,
+        RealmContext &ctx);
 
 /**
  * @brief Allocates the (potentially remote) Realm instances for all of the
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h
index e468bcfb97..c615244722 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance.h
@@ -26,14 +26,15 @@ namespace FlexFlow {
 /**
  * \brief The main public interface for the Realm backend.
  * Takes a \ref MappedParallelComputationGraph and lowers it through
- * \ref DynamicOpenDataflowGraph to get the fully-specified execution order of tasks
- * to be issued. (Note: this is a parallel execution so execution order may not
- * match the order in which operations are issued.) Also tracks the allocation
- * of realm instances for tensors through its \ref TensorInstanceBacking.
+ * \ref DynamicOpenDataflowGraph to get the fully-specified execution order of
+ * tasks to be issued. (Note: this is a parallel execution so execution order
+ * may not match the order in which operations are issued.) Also tracks the
+ * allocation of realm instances for tensors through its \ref
+ * TensorInstanceBacking.
  *
- * \note \ref PCGInstance is primarily just a container for the various structs held
- * inside it. The actual initialization and training iteration functionality is
- * held in \ref create_pcg_instance and \ref
+ * \note \ref PCGInstance is primarily just a container for the various structs
+ * held inside it. The actual initialization and training iteration
+ * functionality is held in \ref create_pcg_instance and \ref
  * perform_update_pass_for_pcg_instance, respectively.
  *
  */
@@ -97,7 +98,8 @@ PCGInstance create_pcg_instance(
 /**
  * \brief Dispatch a training iteration for a \ref PCGInstance.
  *
- * To dispatch just a piece of a training iteration, see the following functions:
+ * To dispatch just a piece of a training iteration, see the following
+ * functions:
  * - \ref perform_forward_pass_for_pcg_instance
  * - \ref perform_backward_pass_for_pcg_instance
  * - \ref perform_update_pass_for_pcg_instance
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index c2b1180be7..0d0b412130 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -18,7 +18,7 @@ namespace FlexFlow {
  * @brief An interface that wraps the rest of Realm and protects against certain
  * classes of bugs, such as shutdown bugs.
  *
- * @warn Do NOT call Realm directly unless you know what you are doing.
+ * @warning Do NOT call Realm directly unless you know what you are doing.
  */
 struct RealmContext {
 public:
@@ -80,7 +80,8 @@ struct RealmContext {
   /**
    * \brief Compact **and clear** the outstanding event queue
    *
-   * \warning **User must block** on event or else use it, or it **will be lost** (potentially resulting in a shutdown hang).
+   * \warning **User must block** on event or else use it, or it **will be
+   * lost** (potentially resulting in a shutdown hang).
    */
   [[nodiscard]] Realm::Event merge_outstanding_events();
 
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 65fcb83a3b..287218749e 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -11,8 +11,8 @@ namespace FlexFlow {
 
 /**
  * @brief Manages the initialization and shutdown of the Realm runtime.
- * Provides the interface to launch the \ref term-controller that runs the rest of the computation
-* (i.e., \ref start_controller).
+ * Provides the interface to launch the \ref term-controller that runs the rest
+ * of the computation (i.e., \ref RealmManager::start_controller).
  */
 struct RealmManager : private RealmContext {
 public:
@@ -28,8 +28,8 @@ struct RealmManager : private RealmContext {
    * one controller for the entire machine. The controller may be a function
    * that closes over data (i.e., a lambda).
    *
-   * @warn If the provided function closes over data, **the user must block on
-   * the resulting event** to ensure it remains in scope until the controller
+   * @warning If the provided function closes over data, **the user must block
+   * on the resulting event** to ensure it remains in scope until the controller
    * completes.
    */
   [[nodiscard]] Realm::Event
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
index 7b919edda7..7219c5c07f 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -7,15 +7,16 @@
 namespace FlexFlow {
 
 /**
- * \brief A stub function to work around Realm not allowing lambdas to be be registered as Realm tasks.
- * Takes the desired lambda to run as the \ref term-controller as an argument and immediately calls it.
+ * \brief A stub function to work around Realm not allowing lambdas to be
+ * registered as Realm tasks. Takes the desired lambda to run as the \ref
+ * term-controller as an argument and immediately calls it.
  */
 void controller_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Dispatches the \ref term-controller task. Packages up the provided \ref std::function and
- * passes it along to \ref controller_task_body.
+ * \brief Dispatches the \ref term-controller task. Packages up the provided \c
+ * std::function and passes it along to \ref controller_task_body.
  */
 Realm::Event
     collective_spawn_controller_task(RealmContext &ctx,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index ae7f5c8691..f6a07e97d4 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -9,7 +9,7 @@ namespace FlexFlow {
 
 /**
  * \brief The function registered as a Realm task for returning the
- * asynchronously-initialized \ref FFHandle. Dispatched by \ref
+ * asynchronously-initialized \ref PerDeviceFFHandle. Dispatched by \ref
  * spawn_ff_handle_init_return_task.
  *
  * To understand how this fits into the broader structure of \ref
@@ -20,7 +20,7 @@ void ff_handle_init_return_task_body(
 
 /**
  * \brief Launches the task (\ref ff_handle_init_return_task_body) for returning
- * the asynchronously-initialized \ref FFHandle.
+ * the asynchronously-initialized \ref PerDeviceFFHandle.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
index ff87b1fa4d..64384b6ae6 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
@@ -9,7 +9,7 @@ namespace FlexFlow {
 
 /**
  * \brief The function registered as a Realm task for starting the asynchronous
- * initialization of the \ref FFHandle. Dispatched by \ref
+ * initialization of the \ref PerDeviceFFHandle. Dispatched by \ref
  * spawn_ff_handle_init_task.
  *
  * To understand how this fits into the broader structure of \ref
@@ -20,7 +20,7 @@ void ff_handle_init_task_body(
 
 /**
  * \brief Launches the task (\ref ff_handle_init_return_task_body) for starting
- * the asynchronous initialization of the \ref FFHandle.
+ * the asynchronous initialization of the \ref PerDeviceFFHandle.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
index e527314346..9f9b467e46 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
@@ -1,4 +1,4 @@
-namespace {
+namespace FlexFlow {
 /**
 
 \page realm-execution-tasks tasks/
@@ -8,7 +8,7 @@ namespace {
 \section tasks-one-part Individual Tasks
 
 Invidividual tasks are just normal Realm tasks, which are implemented in \ref realm-execution as a
-wrapper function for spawning a task (e.g., \ref collective_spawn_controller_task) and a task body which is the actual Realm task implementation (e.g., \ref controller_task_body). Each also has an optional corresponding <em>TaskArgument</em> (e.g., \ref OpTaskArgs) object to provide a structure to the arguments passed from the wrapper to the task body. In cases where the %TaskArgument object is not trivially JSON-serializable, a corresponding JSON-serializable task argument type is provided (e.g., \ref SerializeableOpTaskArgs).
+wrapper function for spawning a task (e.g., \ref collective_spawn_controller_task) and a task body which is the actual Realm task implementation (e.g., \ref controller_task_body). Each also has an optional corresponding <em>TaskArgument</em> (e.g., \ref OpTaskArgs) object to provide a structure to the arguments passed from the wrapper to the task body. In cases where the %TaskArgument object is not trivially JSON-serializable, a corresponding JSON-serializable task argument type is provided (e.g., \ref SerializableOpTaskArgs).
 
 \subsection tasks-controller-tasks Controller Tasks
 
@@ -20,13 +20,13 @@ Implements all of the operator tasks, i.e., the tasks that are executed during t
 
 \section tasks-two-part Paired Tasks
 
-The other two types of tasks are implemented as pairs of tasks: one to begin initializing a value (e.g., \ref spawn_ff_handle_init_task), and another to return the initialized value when it's ready (e.g., \ref spawn_ff_handle_init_return_task). As with \ref task-one-part, they have an optional corresponding tasks argument type and a potential serializable task argument type.
+The other two types of tasks are implemented as pairs of tasks: one to begin initializing a value (e.g., \ref spawn_ff_handle_init_task), and another to return the initialized value when it's ready (e.g., \ref spawn_ff_handle_init_return_task). As with \ref tasks-one-part, they have an optional corresponding tasks argument type and a potential serializable task argument type.
 
 \todo \@Elliott why is the paired tasks structure required? Is it a performance optimization, or simply necessary given the set of primitives Realm provides?
 
 \subsection tasks-ffhandle-init FFHandle Initialization Tasks
 
-For initializing the \ref FFHandle for each GPU. Implemented in \ref ff_handle_init_task.h and \ref ff_handle_init_return_task.h.
+For initializing the \ref PerDeviceFFHandle for each GPU. Implemented in \ref ff_handle_init_task.h and \ref ff_handle_init_return_task.h.
 
 \subsection tasks-op-state-init PerDeviceOpState Initialization Tasks
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 6a0ac53053..4aa0329a96 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -23,31 +23,31 @@ namespace FlexFlow {
 void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task (\ref op_task_body), for a \ref DynamicNodeInvocation
- * using Realm.
+ * \brief Launches the task (\ref op_task_body), for a \ref
+ * DynamicNodeInvocation using Realm.
  *
  * The task launch process functions a bit differently to that used in the
  * previous FlexFlow codebase. Rather than having a function registered with
  * realm/legion for every \ref task_id_t, we now have only a few functions
  * registered: \ref op_task_body, \ref ff_handle_init_task_body,
- * \ref per_device_op_state_init_return_task_body, and \ref controller_task_body (see
- * \ref register_all_tasks for where this list comes from), and in fact only
- * \ref op_task_body is launched by \ref spawn_op_task. Each of these registered
- * tasks use the serialized arguments sent to them to dispatch to the correct
- * implementatin in task-spec: for example, if we are trying to launch the task
- * for a Conv2d operator, this function will actually dispatch a call to \ref
- * op_task_body with a serialized \ref OpTaskArgs as an argument, and then \ref
- * op_task_body will deserialize the argument, determine that we are trying to
- * launch the forward pass of Conv2d, use \ref execute_dynamic_node_invocation
- * (which then uses \ref call_fwd_task_impl) to actually call the function in
- * lib/task-spec/src/task-spec/ops/impl/conv_2d.cc
+ * \ref per_device_op_state_init_return_task_body, and \ref controller_task_body
+ * (see \ref register_all_tasks for where this list comes from), and in fact
+ * only \ref op_task_body is launched by \ref spawn_op_task. Each of these
+ * registered tasks use the serialized arguments sent to them to dispatch to the
+ * correct implementation in task-spec: for example, if we are trying to launch
+ * the task for a Conv2d operator, this function will actually dispatch a call
+ * to \ref op_task_body with a serialized \ref OpTaskArgs as an argument, and
+ * then \ref op_task_body will deserialize the argument, determine that we are
+ * trying to launch the forward pass of Conv2d, use \ref
+ * execute_dynamic_node_invocation (which then uses \ref call_fwd_task_impl) to
+ * actually call the function in lib/task-spec/src/task-spec/ops/impl/conv_2d.cc
  *
  * The above also means that we don't have a separate
- * \ref ITaskArgumentAccessor subclass for realm-execution. Instead we ship over the
- * information on the corresponding realm instances over to the remote node,
+ * \ref ITaskArgumentAccessor subclass for realm-execution. Instead we ship
+ * the information on the corresponding realm instances over to the remote node,
  * grab the corresponding pointer/\ref GenericTensorAccessor, and then use
- * \ref LocalTaskArgumentAccessor for the actual argument access as, by this point,
- * everything is local.
+ * \ref LocalTaskArgumentAccessor for the actual argument access as, by this
+ * point, everything is local.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
index bbe640a376..46a4bab727 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
@@ -20,8 +20,8 @@ void per_device_op_state_init_return_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for returning
- * the asynchronously-initialized \ref PerDeviceOpState.
+ * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for
+ * returning the asynchronously-initialized \ref PerDeviceOpState.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
index 1c4675da2a..95b768a245 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
@@ -26,8 +26,8 @@ void per_device_op_state_init_task_body(
     void const *, size_t, void const *, size_t, Realm::Processor);
 
 /**
- * \brief Launches the task (\ref per_device_op_state_init_task_body) for starting
- * the asynchronous initialization of the \ref PerDeviceOpState.
+ * \brief Launches the task (\ref per_device_op_state_init_task_body) for
+ * starting the asynchronous initialization of the \ref PerDeviceOpState.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h
index 625475f0ae..0d63d3610c 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h
@@ -7,8 +7,7 @@
 namespace FlexFlow {
 
 SerializableFfHandleInitTaskArgs
-    ff_handle_init_task_args_to_serializable(
-        FfHandleInitTaskArgs const &);
+    ff_handle_init_task_args_to_serializable(FfHandleInitTaskArgs const &);
 
 FfHandleInitTaskArgs ff_handle_init_task_args_from_serializable(
     SerializableFfHandleInitTaskArgs const &);
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
index 33bf1abd96..62454d168f 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h
@@ -6,10 +6,12 @@
 
 namespace FlexFlow {
 
-SerializablePerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_to_serializable(
-    PerDeviceOpStateInitTaskArgs const &);
-PerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_from_serializable(
-    SerializablePerDeviceOpStateInitTaskArgs const &);
+SerializablePerDeviceOpStateInitTaskArgs
+    per_device_op_state_init_task_args_to_serializable(
+        PerDeviceOpStateInitTaskArgs const &);
+PerDeviceOpStateInitTaskArgs
+    per_device_op_state_init_task_args_from_serializable(
+        SerializablePerDeviceOpStateInitTaskArgs const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
index 8ff08abe49..a956d53643 100644
--- a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -7,7 +7,7 @@
 namespace FlexFlow {
 
 /**
- * \relates task_id_t
+ * \brief Registers a function as a Realm task.
  *
  * \warning The event returned by this function <em>must be consumed</em> or
  * else Realm may not shut down properly.
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
index b307492d6b..299df5cc3f 100644
--- a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -3,16 +3,16 @@
 
 #include "op-attrs/pcg_operator_attrs.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/realm.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
 #include <optional>
-#include "realm-execution/realm.h"
 
 namespace FlexFlow {
 
 /**
  * \brief Retrieves the \ref task_id_t for a \ref DynamicNodeAttrs, with
- * a return value of \ref std::nullopt to be treated as a no-op task.
+ * a return value of \c std::nullopt to be treated as a no-op task.
  */
 std::optional<task_id_t>
     get_task_id_for_op(DynamicNodeAttrs const &,
@@ -30,12 +30,9 @@ std::optional<task_id_t>
 
 /**
  * \brief Convert a \ref FlexFlow::task_id_t into a Realm task ID.
- *
- * \relates task_id_t
  */
 Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t);
 
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox
index ea1545987f..351d6b2d21 100644
--- a/lib/realm-execution/index.dox
+++ b/lib/realm-execution/index.dox
@@ -20,13 +20,13 @@ This is a single-controller implementation. That means the controller (the task
 - \ref RealmContext "": \copybrief RealmContext
 - \subpage realm-execution-tasks "include/realm-execution/tasks": The Realm task implementations and their supporting infrastructure.
   - \ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
-  - \ref "lib/realm-execution/include/realm-execution/tasks/serializer/" "serializer/": additional support for serializing Realm data types.
-  - \ref realm_task_registry.h: Manages the registration of Realm tasks. All Realm tasks go through this interface.
-  - \ref task_id_t.h and \ref realm_task_id_t.h: Types to represent Realm tasks, along with an encoding to Realm's native task ID type.
+  - \ref lib/realm-execution/include/realm-execution/tasks/serializer/ "serializer/": additional support for serializing Realm data types.
+  - \ref "realm_task_registry.h": Manages the registration of Realm tasks. All Realm tasks go through this interface.
+  - \ref "task_id_t.h": Type (\ref task_id_t) to represent Realm tasks, along with an encoding to Realm's native task ID type.
 - Helper components (mainly used within \ref PCGInstance)
-  - \ref DistributedDeviceHandle "": represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
+  - \ref "DistributedFfHandle": represents a distributed \ref PerDeviceFFHandle (i.e., a \ref PerDeviceFFHandle on each of the GPUs in the machine), for convenience.
   - \ref DependencySet "": tracks dependencies during execution of tasks.
-  - \ref "distributed_device_state_initialization.h": performs device state initialization of dynamic graph nodes and returns the resulting \ref PerDeviceOpStateBacking.
+  - \ref "distributed_per_device_op_state_initialization.h": performs distributed initialization of \ref "PerDeviceOpState"s and packages the results into a \ref PerDeviceOpStateBacking.
   - \ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting \ref TensorInstanceBacking.
 
 \section realm-execution-todo Outstanding TODOs
diff --git a/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
index 185d7e0c89..986401956a 100644
--- a/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
@@ -17,9 +17,9 @@ DeviceSpecificManagedPerDeviceFFHandle const &
 
 DistributedFfHandle
     create_distributed_ff_handle(RealmContext &ctx,
-                                     size_t workSpaceSize,
-                                     bool allowTensorOpMathConversion,
-                                     Realm::Event precondition) {
+                                 size_t workSpaceSize,
+                                 bool allowTensorOpMathConversion,
+                                 Realm::Event precondition) {
   std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
       handles;
 
@@ -36,11 +36,11 @@ DistributedFfHandle
 
   for (auto &[proc, handle] : handles) {
     spawn_ff_handle_init_task(ctx,
-                                  proc,
-                                  workSpaceSize,
-                                  allowTensorOpMathConversion,
-                                  &handle,
-                                  precondition);
+                              proc,
+                              workSpaceSize,
+                              allowTensorOpMathConversion,
+                              &handle,
+                              precondition);
   }
 
   ctx.get_outstanding_events().wait();
diff --git a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
index 8612fa4b97..1e02fcf5d5 100644
--- a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
@@ -46,15 +46,15 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
 
     std::optional<Realm::Event> completion_event =
         spawn_per_device_op_state_init_task(ctx,
-                                     target_proc,
-                                     invocation,
-                                     tensor_backing,
-                                     profiling_settings,
-                                     device_handle.at(target_proc),
-                                     iteration_config,
-                                     optimizer_attrs,
-                                     device_state_ptr,
-                                     precondition);
+                                            target_proc,
+                                            invocation,
+                                            tensor_backing,
+                                            profiling_settings,
+                                            device_handle.at(target_proc),
+                                            iteration_config,
+                                            optimizer_attrs,
+                                            device_state_ptr,
+                                            precondition);
 
     if (completion_event.has_value()) {
       device_state_map.insert(std::pair{invocation, device_state_ptr});
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
index 2801a70940..4ef2919b10 100644
--- a/lib/realm-execution/src/realm-execution/instance_allocation.cc
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -22,9 +22,10 @@
 namespace FlexFlow {
 
 std::pair<Realm::RegionInstance, Realm::Event>
-    perform_instance_allocation_for_value(MachineSpaceCoordinate const &device_coord,
-                                          DynamicValueAttrs const &value,
-                                          RealmContext &ctx) {
+    perform_instance_allocation_for_value(
+        MachineSpaceCoordinate const &device_coord,
+        DynamicValueAttrs const &value,
+        RealmContext &ctx) {
   ASSERT(value.accessor == std::nullopt);
 
   TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value());
@@ -53,8 +54,8 @@ TensorInstanceBacking perform_instance_allocation(
     } else {
       if (!contains_key(result.backing, v)) {
         MachineSpaceCoordinate device_coord = assert_unwrap(n.device_coord);
-        result.backing.insert(
-            std::pair{v, perform_instance_allocation_for_value(device_coord, v, ctx)});
+        result.backing.insert(std::pair{
+            v, perform_instance_allocation_for_value(device_coord, v, ctx)});
       }
       return result.backing.at(v);
     }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
index 552da3cb01..1a90052fa7 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
@@ -20,10 +20,10 @@ struct FfHandleInitReturnTaskArgs {
 };
 
 void ff_handle_init_return_task_body(void const *args,
-                                         size_t arglen,
-                                         void const *userdata,
-                                         size_t userlen,
-                                         Realm::Processor proc) {
+                                     size_t arglen,
+                                     void const *userdata,
+                                     size_t userlen,
+                                     Realm::Processor proc) {
   ASSERT(arglen == sizeof(FfHandleInitReturnTaskArgs));
   FfHandleInitReturnTaskArgs task_args =
       *reinterpret_cast<FfHandleInitReturnTaskArgs const *>(args);
@@ -38,8 +38,7 @@ Realm::Event spawn_ff_handle_init_return_task(
     DeviceSpecificManagedPerDeviceFFHandle const &result,
     DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
     Realm::Event precondition) {
-  FfHandleInitReturnTaskArgs task_args{
-      result, origin_proc, origin_result_ptr};
+  FfHandleInitReturnTaskArgs task_args{result, origin_proc, origin_result_ptr};
 
   return ctx.spawn_task(origin_proc,
                         task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
index ca0a5bf2cd..86d03e45f3 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
@@ -11,8 +11,8 @@ namespace FlexFlow {
 
 static std::optional<ManagedPerDeviceFFHandle *>
     make_ff_handle_for_processor(Realm::Processor processor,
-                                     size_t workSpaceSize,
-                                     bool allowTensorOpMathConversion) {
+                                 size_t workSpaceSize,
+                                 bool allowTensorOpMathConversion) {
   switch (processor.kind()) {
     case Realm::Processor::LOC_PROC:
       return std::nullopt;
@@ -29,29 +29,26 @@ static std::optional<ManagedPerDeviceFFHandle *>
 }
 
 void ff_handle_init_task_body(void const *args,
-                                  size_t arglen,
-                                  void const *userdata,
-                                  size_t userlen,
-                                  Realm::Processor proc) {
-  FfHandleInitTaskArgs task_args =
-      ff_handle_init_task_args_from_serializable(
-          deserialize_task_args<SerializableFfHandleInitTaskArgs>(args,
-                                                                      arglen));
+                              size_t arglen,
+                              void const *userdata,
+                              size_t userlen,
+                              Realm::Processor proc) {
+  FfHandleInitTaskArgs task_args = ff_handle_init_task_args_from_serializable(
+      deserialize_task_args<SerializableFfHandleInitTaskArgs>(args, arglen));
 
   RealmContext ctx{proc};
   DeviceSpecificManagedPerDeviceFFHandle managed_handle =
       make_device_specific_managed_handle(
           ctx.get_current_device_idx(),
-          make_ff_handle_for_processor(
-              proc,
-              task_args.workSpaceSize,
-              task_args.allowTensorOpMathConversion));
+          make_ff_handle_for_processor(proc,
+                                       task_args.workSpaceSize,
+                                       task_args.allowTensorOpMathConversion));
 
   spawn_ff_handle_init_return_task(ctx,
-                                       task_args.origin_proc,
-                                       managed_handle,
-                                       task_args.origin_result_ptr,
-                                       Realm::Event::NO_EVENT);
+                                   task_args.origin_proc,
+                                   managed_handle,
+                                   task_args.origin_result_ptr,
+                                   Realm::Event::NO_EVENT);
 }
 
 Realm::Event spawn_ff_handle_init_task(
@@ -69,8 +66,8 @@ Realm::Event spawn_ff_handle_init_task(
       result_ptr,
   };
 
-  std::string serialized_args = serialize_task_args(
-      ff_handle_init_task_args_to_serializable(task_args));
+  std::string serialized_args =
+      serialize_task_args(ff_handle_init_task_args_to_serializable(task_args));
   return ctx.spawn_task(target_proc,
                         task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
                         serialized_args.data(),
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index e86574c9b9..2eaec4d6ea 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -74,13 +74,13 @@ Realm::Event spawn_op_task(
     Realm::Event precondition) {
 
   OpTaskArgs task_args = OpTaskArgs{
-    invocation,
-    tensor_backing,
-    device_state,
-    profiling_settings,
-    device_handle,
-    iteration_config,
-    optimizer_attrs,
+      invocation,
+      tensor_backing,
+      device_state,
+      profiling_settings,
+      device_handle,
+      iteration_config,
+      optimizer_attrs,
   };
 
   std::string serialized_args =
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
index af04a835f2..222ddb28b8 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
@@ -20,10 +20,10 @@ struct PerDeviceOpStateInitReturnTaskArgs {
 };
 
 void per_device_op_state_init_return_task_body(void const *args,
-                                        size_t arglen,
-                                        void const *userdata,
-                                        size_t userlen,
-                                        Realm::Processor proc) {
+                                               size_t arglen,
+                                               void const *userdata,
+                                               size_t userlen,
+                                               Realm::Processor proc) {
   ASSERT(arglen == sizeof(PerDeviceOpStateInitReturnTaskArgs));
   PerDeviceOpStateInitReturnTaskArgs task_args =
       *reinterpret_cast<PerDeviceOpStateInitReturnTaskArgs const *>(args);
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
index 50231c554b..c5ff8f39be 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
@@ -19,14 +19,14 @@
 namespace FlexFlow {
 
 void per_device_op_state_init_task_body(void const *args,
-                                 size_t arglen,
-                                 void const *userdata,
-                                 size_t userlen,
-                                 Realm::Processor proc) {
+                                        size_t arglen,
+                                        void const *userdata,
+                                        size_t userlen,
+                                        Realm::Processor proc) {
   PerDeviceOpStateInitTaskArgs task_args =
       per_device_op_state_init_task_args_from_serializable(
-          deserialize_task_args<SerializablePerDeviceOpStateInitTaskArgs>(args,
-                                                                     arglen));
+          deserialize_task_args<SerializablePerDeviceOpStateInitTaskArgs>(
+              args, arglen));
 
   RealmContext ctx{proc};
   device_handle_t device_handle =
@@ -67,10 +67,10 @@ void per_device_op_state_init_task_body(void const *args,
   DeviceSpecificPtr<PerDeviceOpState> result_device_specific{
       ctx.get_current_device_idx(), result_state_ptr};
   spawn_per_device_op_state_init_return_task(ctx,
-                                      task_args.origin_proc,
-                                      result_device_specific,
-                                      task_args.origin_result_ptr,
-                                      Realm::Event::NO_EVENT);
+                                             task_args.origin_proc,
+                                             result_device_specific,
+                                             task_args.origin_result_ptr,
+                                             Realm::Event::NO_EVENT);
 }
 
 std::optional<Realm::Event> spawn_per_device_op_state_init_task(
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
index 5f0e8e5a7f..0aaa3dacae 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
@@ -3,8 +3,7 @@
 namespace FlexFlow {
 
 SerializableFfHandleInitTaskArgs
-    ff_handle_init_task_args_to_serializable(
-        FfHandleInitTaskArgs const &args) {
+    ff_handle_init_task_args_to_serializable(FfHandleInitTaskArgs const &args) {
   return SerializableFfHandleInitTaskArgs{
       /*workSpaceSize=*/args.workSpaceSize,
       /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
index fc30837325..7b52d9c03d 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc
@@ -6,8 +6,9 @@
 
 namespace FlexFlow {
 
-SerializablePerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_to_serializable(
-    PerDeviceOpStateInitTaskArgs const &args) {
+SerializablePerDeviceOpStateInitTaskArgs
+    per_device_op_state_init_task_args_to_serializable(
+        PerDeviceOpStateInitTaskArgs const &args) {
   return SerializablePerDeviceOpStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
       /*tensor_backing*/
@@ -21,8 +22,9 @@ SerializablePerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_to_s
   };
 }
 
-PerDeviceOpStateInitTaskArgs per_device_op_state_init_task_args_from_serializable(
-    SerializablePerDeviceOpStateInitTaskArgs const &args) {
+PerDeviceOpStateInitTaskArgs
+    per_device_op_state_init_task_args_from_serializable(
+        SerializablePerDeviceOpStateInitTaskArgs const &args) {
   return PerDeviceOpStateInitTaskArgs{
       /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
       /*tensor_backing*/
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
index 69d5a163c8..e7a8948f8d 100644
--- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -2,9 +2,9 @@
 #include "realm-execution/tasks/impl/controller_task.h"
 #include "realm-execution/tasks/impl/ff_handle_init_return_task.h"
 #include "realm-execution/tasks/impl/ff_handle_init_task.h"
+#include "realm-execution/tasks/impl/op_task.h"
 #include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h"
 #include "realm-execution/tasks/impl/per_device_op_state_init_task.h"
-#include "realm-execution/tasks/impl/op_task.h"
 #include "realm-execution/tasks/task_id_t.h"
 #include "utils/exception.h"
 
@@ -54,10 +54,14 @@ Realm::Event register_all_tasks() {
   };
 
   for (task_id_t task_id : init_task_ids) {
-    pending_registrations.push_back(register_task(
-        Realm::Processor::LOC_PROC, task_id, per_device_op_state_init_task_body));
-    pending_registrations.push_back(register_task(
-        Realm::Processor::TOC_PROC, task_id, per_device_op_state_init_task_body));
+    pending_registrations.push_back(
+        register_task(Realm::Processor::LOC_PROC,
+                      task_id,
+                      per_device_op_state_init_task_body));
+    pending_registrations.push_back(
+        register_task(Realm::Processor::TOC_PROC,
+                      task_id,
+                      per_device_op_state_init_task_body));
   }
 
   std::vector<task_id_t> task_ids = {
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index d9b50b5ea7..8e5918b0f0 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -205,10 +205,10 @@ TEST_SUITE(FF_TEST_SUITE) {
       std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
           input_tensors;
 
-      DistributedFfHandle device_handle = create_distributed_ff_handle(
-          ctx,
-          /*workSpaceSize=*/1024 * 1024,
-          /*allowTensorOpMathConversion=*/true);
+      DistributedFfHandle device_handle =
+          create_distributed_ff_handle(ctx,
+                                       /*workSpaceSize=*/1024 * 1024,
+                                       /*allowTensorOpMathConversion=*/true);
 
       PCGInstance pcg_instance = create_pcg_instance(
           /*ctx=*/ctx,
@@ -411,8 +411,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                          /*nesterov=*/false,
                                          /*weight_decay=*/0.001}};
 
-
-//! [realm-execution example]
+    //! [realm-execution example]
     std::vector<char *> fake_args =
         make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/1_n);
     int fake_argc = fake_args.size();
@@ -432,10 +431,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
           input_tensors;
 
-      DistributedFfHandle device_handle = create_distributed_ff_handle(
-          ctx,
-          /*workSpaceSize=*/1024 * 1024,
-          /*allowTensorOpMathConversion=*/true);
+      DistributedFfHandle device_handle =
+          create_distributed_ff_handle(ctx,
+                                       /*workSpaceSize=*/1024 * 1024,
+                                       /*allowTensorOpMathConversion=*/true);
 
       PCGInstance pcg_instance = create_pcg_instance(
           /*ctx=*/ctx,
@@ -486,7 +485,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     });
 
     e.wait();
-//! [realm-execution example]
+    //! [realm-execution example]
   }
 }
 
diff --git a/lib/substitutions/index.dox b/lib/substitutions/index.dox
index fbfa4af9dd..821a81a59e 100644
--- a/lib/substitutions/index.dox
+++ b/lib/substitutions/index.dox
@@ -22,9 +22,9 @@ The input graph derived by this match is then defined by `values(node_assignment
 
 \section output-graph-expr OutputGraphExpr
 
-An \ref OutputGraphExpr is defined as an open graph with node label \ref OutputOperatorAttrAssignment and output label \ref std::monostate.
+An \ref OutputGraphExpr is defined as an open graph with node label \ref OutputOperatorAttrsAssignment and output label \c std::monostate.
 
-\ref OutputOperatorAttrAssignment is a collection of \ref OperatorAttributeKey and \ref OutputOperatorAttributeExpr pairs. It defines how the attributes of a single operator is calculated from the input graph. A pair \c "{operator_attribute_key, output_operator_attribute_expr}" in the collection means the value of \c output_operator_attribute_expr is assigned to the attribute named \c operator_attribute_key of the operator.
+\ref OutputOperatorAttrsAssignment is a collection of \ref OperatorAttributeKey and \ref OutputOperatorAttributeExpr pairs. It defines how the attributes of a single operator are calculated from the input graph. A pair \c "{operator_attribute_key, output_operator_attribute_expr}" in the collection means the value of \c output_operator_attribute_expr is assigned to the attribute named \c operator_attribute_key of the operator.
 
 */
 }
diff --git a/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h
index fddad49ddf..17f59702b3 100644
--- a/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h
+++ b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h
@@ -23,11 +23,13 @@ std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x);
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 template <>
 struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> {
   size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const;
 };
 } // namespace std
+///\endcond
 
 #endif
diff --git a/lib/task-spec/include/task-spec/generic_task_impl_function.h b/lib/task-spec/include/task-spec/generic_task_impl_function.h
index a4707a2f6f..c17fb62af5 100644
--- a/lib/task-spec/include/task-spec/generic_task_impl_function.h
+++ b/lib/task-spec/include/task-spec/generic_task_impl_function.h
@@ -23,11 +23,13 @@ std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x);
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 template <>
 struct hash<::FlexFlow::GenericTaskImplFunction> {
   size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const;
 };
 } // namespace std
+///\endcond
 
 #endif
diff --git a/lib/task-spec/include/task-spec/task_argument_accessor/index.dox b/lib/task-spec/include/task-spec/task_argument_accessor/index.dox
index 3f0f3acd32..9c42a19188 100644
--- a/lib/task-spec/include/task-spec/task_argument_accessor/index.dox
+++ b/lib/task-spec/include/task-spec/task_argument_accessor/index.dox
@@ -75,7 +75,7 @@ TaskArgumentAccessor is just a thin, ref-counted wrapper around the abstract ITa
 Instances of ITaskArgumentAccessor provide access to the following arguments:
 
 - One of \ref PCGOperatorAttrs, \ref LossAttrs, or \ref OptimizerAttrs depending on whether this task is for an operator, an optimizer, or a loss function.
-- Two pieces of device-specific state: \ref device_handle_t (aka \ref term-ff-handle FF handle) and \ref term-per-device-op-state PerDeviceOpState. As both of these contain pointers and hold device-specific initialization state, in distributed execution their addresses (rather than their contents) are passed around, and they are only valid on the device they originated on. One \ref term-ff-handle should be created per device, while one \ref per-device-op-state should be create for every operator for every device it runs on.
+- Two pieces of device-specific state: \ref device_handle_t (aka \ref PerDeviceFFHandle) and \ref PerDeviceOpState. As both of these contain pointers and hold device-specific initialization state, in distributed execution their addresses (rather than their contents) are passed around, and they are only valid on the device they originated on. One \ref PerDeviceFFHandle should be created per device, while one \ref PerDeviceOpState should be created for every operator for every device it runs on.
 - A few simple value types communicating runtime-wide settings: \ref ProfilingSettings, \ref DeviceType, and \ref FFIterationConfig.
 
 */
diff --git a/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc
index 3450b5d268..9b040b6021 100644
--- a/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc
+++ b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc
@@ -46,9 +46,12 @@ std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) {
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 size_t hash<FlexFlow::FwdBwdOpTaskImplFunction>::operator()(
     ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const {
   return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
 }
+
 } // namespace std
+///\endcond
diff --git a/lib/task-spec/src/task-spec/generic_task_impl_function.cc b/lib/task-spec/src/task-spec/generic_task_impl_function.cc
index 4abd1ab644..84bed4e9d2 100644
--- a/lib/task-spec/src/task-spec/generic_task_impl_function.cc
+++ b/lib/task-spec/src/task-spec/generic_task_impl_function.cc
@@ -45,9 +45,12 @@ std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) {
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 size_t hash<FlexFlow::GenericTaskImplFunction>::operator()(
     ::FlexFlow::GenericTaskImplFunction const &x) const {
   return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
 }
+
 } // namespace std
+///\endcond
diff --git a/lib/task-spec/src/task-spec/init_op_task_impl_function.cc b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc
index 4cd55fc488..ce72dcb630 100644
--- a/lib/task-spec/src/task-spec/init_op_task_impl_function.cc
+++ b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc
@@ -45,9 +45,11 @@ std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) {
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 size_t hash<FlexFlow::InitOpTaskImplFunction>::operator()(
     ::FlexFlow::InitOpTaskImplFunction const &x) const {
   return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
 }
 } // namespace std
+///\endcond
diff --git a/lib/utils/include/utils/any_value_type/any_value_type.h b/lib/utils/include/utils/any_value_type/any_value_type.h
index a99ce5c8f0..fc4d6b488d 100644
--- a/lib/utils/include/utils/any_value_type/any_value_type.h
+++ b/lib/utils/include/utils/any_value_type/any_value_type.h
@@ -34,7 +34,9 @@ struct any_value_type {
   std::function<size_t(std::any const &)> hash;
   std::function<std::string(std::any const &)> to_string;
 
+  ///\cond
   friend std::hash<any_value_type>;
+  ///\endcond
 };
 
 template <typename T>
@@ -54,6 +56,7 @@ any_value_type make_any_value_type(T const &t) {
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 
 template <>
@@ -62,5 +65,6 @@ struct hash<::FlexFlow::any_value_type> {
 };
 
 } // namespace std
+///\endcond
 
 #endif
diff --git a/lib/utils/include/utils/graph/index.dox b/lib/utils/include/utils/graph/index.dox
index 355117497f..75793b2ed4 100644
--- a/lib/utils/include/utils/graph/index.dox
+++ b/lib/utils/include/utils/graph/index.dox
@@ -104,21 +104,21 @@ The last paragraph covered the base API used to write to graphs, but we also wan
 Reading from graphs is implemented with the \c query_nodes and \c query_edges methods, which can be thought of as executing a database query over the nodes and edges of the target graph, respectively (where queries are restricted to an incredibly simple set of operations).
 The argument to \c query_nodes is a \ref NodeQuery (which is simply a set of \ref Node ""s).
 \c query_nodes then returns the intersection of the nodes in the graph and the nodes in the query.
-The set of nodes in the query is actually a \ref std::optional, so \ref std::nullopt could also be passed, which would simply retrieve all nodes from the target graph (essentially \ref std::nullopt acts as the set of all nodes that could ever exist).
+The set of nodes in the query is actually a \c std::optional, so \c std::nullopt could also be passed, which would simply retrieve all nodes from the target graph (essentially \c std::nullopt acts as the set of all nodes that could ever exist).
 \c query_edges functions similarly, but as with \c add_edge its behavior is differs slightly between the three graph variants.
 \ref UndirectedGraph::query_edges simply takes an optional set of nodes and returns all edges that touch any of those nodes.
 \ref DiGraph::query_edges allows separate sets for source and destination nodes, and \ref MultiDiGraph::query_edges adds the ability to filter by source and destination indices as well.
 
 In practice you will rarely ever use \c query_nodes and \c query_edges as the graph library provides a large number of algorithms that do that work for you, but it can be helpful to understand this base layer if you ever need to implement your own algorithms.
 The layer users will most commonly interact with is the interface provided within either the \c algorithms.h header files or the \c algorithms folders, present in their respective graph class folders.
-They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as \ref get_nodes to as complex as \ref get_transitive_reduction and \ref get_dominators.
+They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as \ref get_nodes to as complex as \ref transitive_reduction and \ref get_dominators.
 Note that, due to the internal virtual inheritance structure, some functions for more privitive classes can be employed by the derived classes. (For example, `get_nodes` present in `node/algorithms.h` can be used by \ref DiGraph).
 You may notice that the most of algorithms present take as arguments not \ref UndirectedGraph, \ref DiGraph, and \ref MultiDiGraph, but rather \ref UndirectedGraphView, \ref DiGraphView, and \ref MultiDiGraphView.
-These \ref GraphView objects represent read-only (i.e., immutable) graphs.
-Similar to C++'s \c const semantics, \ref Graph ""s can be coerced \ref graph-footnote-2 "[2]" to \ref GraphView ""s but not the other way around.
-To transform a \ref GraphView to a \ref Graph, we can perform an explicit copy with \ref materialize_view.
-Both \ref Graph and \ref GraphView types follow normal value semantics.
-This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the \ref Graph and \ref GraphView types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a \ref DiGraph use for example \ref "get_subgraph(DiGraphView const &, std::unordered_set<Node> const *)" "get_subgraph" to get a \ref DiGraphView representing a part of this graph, modifications to the underlying \ref DiGraph will not be mirrored in the \ref DiGraphView and the \ref DiGraphView will remain valid even after the base \ref DiGraph leaves scope.
+These <em>GraphView</em> objects represent read-only (i.e., immutable) graphs.
+Similar to C++'s \c const semantics, <em>Graphs</em> can be coerced \ref graph-footnote-2 "[2]" to <em>GraphViews</em>, but not the other way around.
+To transform a <em>GraphView</em> (e.g., \ref DiGraphView) to a <em>Graph</em> (e.g., \ref DiGraph), we can perform an explicit copy with a <em>materialize function</em> (e.g., \ref materialize_digraph_view).
+Both <em>Graph</em> and <em>GraphView</em> types follow normal value semantics.
+This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the <em>Graph</em> and <em>GraphView</em> types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a \ref DiGraph and use, for example, \ref "get_subgraph(DiGraphView const &, std::unordered_set<Node> const &)" "get_subgraph" to get a \ref DiGraphView representing a part of this graph, modifications to the underlying \ref DiGraph will not be mirrored in the \ref DiGraphView and the \ref DiGraphView will remain valid even after the base \ref DiGraph leaves scope).
 
 At this point, however, we still have not discussed how to create a graph.
 The user-facing graph interface is intentionally separated from the underlying graph representations, so representations can be changed without requiring any user-side code modifications besides the choice of which implementation to use.
@@ -132,7 +132,7 @@ Generally users will use underlying representations provided by the graph librar
 
 \subsection dataflow-graph DataflowGraph
 
-The primary abstraction for representing computation graphs / task graphs is the \ref DataflowGraph interface (along with its variants, \ref OpenDataflowGraph, \ref LabelleledDataflowGraph and \ref OpenLabelleledDataflowGraph).
+The primary abstraction for representing computation graphs / task graphs is the \ref DataflowGraph interface (along with its variants, \ref OpenDataflowGraph, \ref LabelledDataflowGraph and \ref LabelledOpenDataflowGraph).
 At a high level, nodes represent multivariate functions (from tuples of inputs to tuple of outputs), while edges represent value uses of such functions.
 
 \ref DataflowGraph is similar to \ref MultiDiGraph, but with the following important differences:
@@ -141,10 +141,10 @@ At a high level, nodes represent multivariate functions (from tuples of inputs t
   - \ref DataflowGraph ""s are directed acyclic graphs. This is enforced by the interface used to construct them, since a node can only be added to the graph after all of its predecessor nodes have already been added.
 
 The main components of \ref DataflowGraph are as follows:
-- \ref DataflowInput: used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator).
-- \ref DataflowOutput: used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator).
-- \ref DataflowEdge: wrapper around a \ref DataflowInput, \ref DataflowOutput pair between 2 nodes.
-- \ref NodeAddedResult "": returned upon adding a new node. Contains the newly generated \ref Node and the \ref std::vector of \ref DataflowOutput ""s for the given node.
+- \ref "DataflowInput": used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator).
+- \ref "DataflowOutput": used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator).
+- \ref "DataflowEdge": wrapper around a \ref DataflowInput, \ref DataflowOutput pair between 2 nodes.
+- \ref "NodeAddedResult": returned upon adding a new node. Contains the newly generated \ref Node and the \c std::vector of \ref DataflowOutput ""s for the given node.
 
 \ref DataflowGraph ""s are constructed as follows:
 
@@ -196,20 +196,20 @@ digraph {
 
 "Open" should be interpreted in the topological sense: that is, a graph that contains some edges where one of the edge's 2 nodes is not present in the graph itself.
 This graph class is particularly useful for processing a sub-graph of a given graph while still maintaining information regarding the edges that cross the cut.
-\ref DataflowGraphInput is used to represent the open (incoming) inputs to the graph. Note that, unlike \ref DataFlowInput, \ref DataflowGraphInput ""s are unordered (given that they are inputs to possibly several different nodes within the graph).
+\ref DataflowGraphInput is used to represent the open (incoming) inputs to the graph. Note that, unlike \ref DataflowInput, \ref DataflowGraphInput ""s are unordered (given that they are inputs to possibly several different nodes within the graph).
 
 \subsection labelled-dataflow-variant Labelled Dataflow Variant
 
 As nice as all of the above is, graphs without labels are mostly useless--in practice, nodes and edges represent some other system and the properties of that system (or at least a way to map the result of graph algorithms back to the underlying system) are necessary.
-Thus, FlexFlow's graph library provides the ability to add labels to \ref DataflowGraph, through the \ref LabelleledDataflowGraph and \ref OpenLabelleledDataflowGraph, which allow users to label different components of the graph.
+Thus, FlexFlow's graph library provides the ability to add labels to \ref DataflowGraph, through the \ref LabelledDataflowGraph and \ref LabelledOpenDataflowGraph, which allow users to label different components of the graph.
 - \ref LabelledDataflowGraph allows for labelling of \ref Node ""s and \ref DataflowOutput ""s.
-- \ref OpenLabelledDataflowGraph allows for labelling of \ref Node ""s and \ref OpenDataflowValue ""s, which is a variant describing both \ref DataflowOutput ""s and \ref DataflowGraphInput ""s.
+- \ref LabelledOpenDataflowGraph allows for labelling of \ref Node ""s and \ref OpenDataflowValue ""s, which is a variant describing both \ref DataflowOutput ""s and \ref DataflowGraphInput ""s.
 
 While the interfaces of these graphs differ slightly from the core graph variants, they still have the corresponding \ref LabelledDataflowGraph::add_node methods, and \ref LabelledDataflowGraph::query_nodes / \ref LabelledDataflowGraph::query_edges methods. (Note that there is no \c add_edge method since, for \ref DataflowGraph, edges are implicitly added when we add a node and specify its predecessors)
 Note that all of the labelled graph types require that each element of the labelled types have a label, which is enforced via the interfaces they provide.
-Partial labelling can be implement via wrapping the label type in \ref std::optional.
+Partial labelling can be implemented via wrapping the label type in \c std::optional.
 Interacting with \c Node and \c Edge objects is still necessary to use the labelled graph types: intuitively the labelled graph types can be thought of as a pair of a core graph variant and a hash map the maps nodes/edges to labels.
-As such, the labelled graph types provide the typical \ref LabelledDataflowGraph::at method (as on \ref std::unordered_map \ref graph-footnote-3 "[3]") and can be coerced to their underlying core graph variants.
+As such, the labelled graph types provide the typical \ref LabelledDataflowGraph::at method (as on \c std::unordered_map \ref graph-footnote-3 "[3]") and can be coerced to their underlying core graph variants.
 
 \section graph-internals Internals
 
diff --git a/lib/utils/src/utils/any_value_type/any_value_type.cc b/lib/utils/src/utils/any_value_type/any_value_type.cc
index d4c605c441..0e55967e05 100644
--- a/lib/utils/src/utils/any_value_type/any_value_type.cc
+++ b/lib/utils/src/utils/any_value_type/any_value_type.cc
@@ -24,11 +24,13 @@ std::string format_as(any_value_type const &v) {
 
 } // namespace FlexFlow
 
+///\cond
 namespace std {
 
 size_t hash<::FlexFlow::any_value_type>::operator()(
     ::FlexFlow::any_value_type const &v) const {
   return v.hash(v);
 }
 
 } // namespace std
+///\endcond
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc
index f75145b6af..72c2d9d3c7 100644
--- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc
+++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc
@@ -22,8 +22,7 @@ std::string as_dot(OpenDataflowGraphView const &g) {
   return as_dot(g, get_node_label, get_input_label);
 }
 
-/**
- * WARN(@lockshaw): doing this all with string ids is ugly and error prone,
+/* WARN(@lockshaw): doing this all with string ids is ugly and error prone,
  * as it requires duplicating the stringification logic across functions.
  *
  * Fixing this is tracked in issue
diff --git a/lib/utils/src/utils/graph/series_parallel/graph_generation.cc b/lib/utils/src/utils/graph/series_parallel/graph_generation.cc
index 1390b51db5..e60db0a87f 100644
--- a/lib/utils/src/utils/graph/series_parallel/graph_generation.cc
+++ b/lib/utils/src/utils/graph/series_parallel/graph_generation.cc
@@ -13,8 +13,7 @@ void parallel_extend_unsafe(DataflowGraph &g, DataflowGraphView const &ext) {
 }
 
 void series_extend_unsafe(DataflowGraph &g, DataflowGraphView const &ext) {
-  /**
-   * TODO(@lockshaw): This function signature is impossible to implement in
+  /* TODO(@lockshaw): This function signature is impossible to implement in
    * general, as there is no guarantee that the graph view ext actually has
    * source nodes with inputs Either the signature should be changed, or an
    * implementation should be added that throws an error if this problematic
diff --git a/lib/utils/src/utils/graph/traversal.cc b/lib/utils/src/utils/graph/traversal.cc
index fff5a0f958..a4df327b2a 100644
--- a/lib/utils/src/utils/graph/traversal.cc
+++ b/lib/utils/src/utils/graph/traversal.cc
@@ -6,32 +6,27 @@
 
 namespace FlexFlow {
 
-using cdi = checked_dfs_iterator;
-using udi = unchecked_dfs_iterator;
-using bfi = bfs_iterator;
-/* using bdi = BoundaryDFSView::boundary_dfs_iterator; */
-
-udi::unchecked_dfs_iterator(DiGraphView const &g,
-                            std::vector<Node> const &stack)
+unchecked_dfs_iterator::unchecked_dfs_iterator(DiGraphView const &g,
+                                               std::vector<Node> const &stack)
     : stack(stack), graph(g) {}
 
-udi::unchecked_dfs_iterator(DiGraphView const &g,
-                            std::unordered_set<Node> const &starting_points)
+unchecked_dfs_iterator::unchecked_dfs_iterator(
+    DiGraphView const &g, std::unordered_set<Node> const &starting_points)
     : graph(g) {
   for (Node const &n : starting_points) {
     this->stack.push_back(n);
   }
 }
 
-udi::reference udi::operator*() const {
+unchecked_dfs_iterator::reference unchecked_dfs_iterator::operator*() const {
   return this->stack.back();
 }
 
-udi::pointer udi::operator->() {
+unchecked_dfs_iterator::pointer unchecked_dfs_iterator::operator->() {
   return &this->operator*();
 }
 
-udi &udi::operator++() {
+unchecked_dfs_iterator &unchecked_dfs_iterator::operator++() {
   Node const last = this->operator*();
   this->stack.pop_back();
 
@@ -48,41 +43,43 @@ udi &udi::operator++() {
   return *this;
 }
 
-void udi::skip() {
+void unchecked_dfs_iterator::skip() {
   this->stack.pop_back();
 }
 
-udi udi::operator++(int) {
+unchecked_dfs_iterator unchecked_dfs_iterator::operator++(int) {
   auto tmp = *this;
   ++(*this);
   return tmp;
 }
 
-bool udi::operator==(udi const &other) const {
+bool unchecked_dfs_iterator::operator==(
+    unchecked_dfs_iterator const &other) const {
   return this->stack == other.stack;
 }
 
-bool udi::operator!=(udi const &other) const {
+bool unchecked_dfs_iterator::operator!=(
+    unchecked_dfs_iterator const &other) const {
   return this->stack != other.stack;
 }
 
-cdi::checked_dfs_iterator(DiGraphView const &g,
-                          std::vector<Node> const &stack,
-                          std::unordered_set<Node> const &seen)
+checked_dfs_iterator::checked_dfs_iterator(DiGraphView const &g,
+                                           std::vector<Node> const &stack,
+                                           std::unordered_set<Node> const &seen)
     : iter(g, stack), seen(seen) {}
 
-cdi::checked_dfs_iterator(DiGraphView const &g,
-                          std::unordered_set<Node> const &starting_points)
+checked_dfs_iterator::checked_dfs_iterator(
+    DiGraphView const &g, std::unordered_set<Node> const &starting_points)
     : iter(g, starting_points), seen{} {}
 
-cdi::reference cdi::operator*() const {
+checked_dfs_iterator::reference checked_dfs_iterator::operator*() const {
   return this->iter.operator*();
 }
-cdi::pointer cdi::operator->() {
+checked_dfs_iterator::pointer checked_dfs_iterator::operator->() {
   return this->iter.operator->();
 }
 
-cdi &cdi::operator++() {
+checked_dfs_iterator &checked_dfs_iterator::operator++() {
   this->seen.insert(*iter);
   this->iter++;
   while (contains(this->seen, *iter)) {
@@ -91,42 +88,42 @@ cdi &cdi::operator++() {
   return *this;
 }
 
-cdi cdi::operator++(int) {
+checked_dfs_iterator checked_dfs_iterator::operator++(int) {
   auto tmp = *this;
   ++(*this);
   return tmp;
 }
 
-bool cdi::operator==(cdi const &other) const {
+bool checked_dfs_iterator::operator==(checked_dfs_iterator const &other) const {
   return this->iter == other.iter && this->seen == other.seen;
 }
 
-bool cdi::operator!=(cdi const &other) const {
-  return this->iter != other.iter && this->seen != other.seen;
+bool checked_dfs_iterator::operator!=(checked_dfs_iterator const &other) const {
+  return this->iter != other.iter || this->seen != other.seen;
 }
 
-bfi::bfs_iterator(DiGraphView const &g,
-                  std::queue<Node> const &q,
-                  std::optional<std::unordered_set<Node>> const &seen)
+bfs_iterator::bfs_iterator(DiGraphView const &g,
+                           std::queue<Node> const &q,
+                           std::optional<std::unordered_set<Node>> const &seen)
     : graph(g), q(q), seen(seen) {}
 
-bfi::bfs_iterator(DiGraphView const &g,
-                  std::unordered_set<Node> const &starting_points)
+bfs_iterator::bfs_iterator(DiGraphView const &g,
+                           std::unordered_set<Node> const &starting_points)
     : graph(g), seen(std::unordered_set<Node>{}) {
   for (Node const &n : starting_points) {
     this->q.push(n);
   }
 }
 
-bfi::reference bfi::operator*() const {
+bfs_iterator::reference bfs_iterator::operator*() const {
   return this->q.front();
 }
 
-bfi::pointer bfi::operator->() {
+bfs_iterator::pointer bfs_iterator::operator->() {
   return &this->operator*();
 }
 
-bfi &bfi::operator++() {
+bfs_iterator &bfs_iterator::operator++() {
   Node current = this->operator*();
   assert(this->seen.has_value());
   this->seen.value().insert(current);
@@ -147,20 +144,20 @@ bfi &bfi::operator++() {
   return *this;
 }
 
-bfi bfi::operator++(int) {
+bfs_iterator bfs_iterator::operator++(int) {
   auto tmp = *this;
   ++(*this);
   return tmp;
 }
 
-bool bfi::operator==(bfi const &other) const {
+bool bfs_iterator::operator==(bfs_iterator const &other) const {
   return this->q == other.q &&
          (!this->seen.has_value() || !other.seen.has_value() ||
           this->seen == other.seen) &&
          is_ptr_equal(this->graph, other.graph);
 }
 
-bool bfi::operator!=(bfi const &other) const {
+bool bfs_iterator::operator!=(bfs_iterator const &other) const {
   return this->q != other.q ||
          (this->seen.has_value() && other.seen.has_value() &&
           this->seen != other.seen) &&
diff --git a/lib/utils/src/utils/half.cc b/lib/utils/src/utils/half.cc
index 3dbea5c4dc..7a4415ab62 100644
--- a/lib/utils/src/utils/half.cc
+++ b/lib/utils/src/utils/half.cc
@@ -1,6 +1,7 @@
 #include "utils/half.h"
 #include "utils/hash-utils.h"
 
+///\cond
 namespace std {
 
 size_t hash<half>::operator()(half h) const {
@@ -8,3 +9,4 @@ size_t hash<half>::operator()(half h) const {
 }
 
 } // namespace std
+///\endcond

From 416eb21d0dfd2f6dc14b8827a826836cd2a0daf7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 11 Mar 2026 15:50:08 -0700
Subject: [PATCH 100/113] Update docs on Realm paired task structure.

---
 .../include/realm-execution/tasks/impl/index.dox                | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
index 9f9b467e46..910488a863 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox
@@ -22,7 +22,7 @@ Implements all of the operator tasks, i.e., the tasks that are executed during t
 
 The other two types of tasks are implemented as pairs of tasks: one to begin initializing a value (e.g., \ref spawn_ff_handle_init_task), and another to return the initialized value when it's ready (e.g., \ref spawn_ff_handle_init_return_task). As with \ref tasks-one-part, they have an optional corresponding tasks argument type and a potential serializable task argument type.
 
-\todo \@Elliott why is the paired tasks structure required? Is it a performance optimization, or simply necessary given the set of primitives Realm provides?
+The paired task structure is required because Realm tasks do not return values. Spawning a Realm task returns a completion event, but the event does not encode any information (other than that the task is finished). Thus, to return a value to the caller, a second task is required to send the value back, and the caller must block until this task completes to ensure that the data is available before proceeding.
 
 \subsection tasks-ffhandle-init FFHandle Initialization Tasks
 

From a5c9a9913474b1b0c521160a92253618745b3a6a Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 10:29:44 -0700
Subject: [PATCH 101/113] Fix data corruption in controller task args.

---
 .../include/realm-execution/realm_manager.h   |   3 +-
 .../tasks/impl/controller_task.h              |  38 ++++-
 .../tasks/impl/controller_task_args.dtg.toml  |  13 ++
 .../src/realm-execution/realm_manager.cc      |   3 +-
 .../tasks/impl/controller_task.cc             |  50 ++++---
 .../test/src/realm-execution/realm_manager.cc |  13 +-
 .../test/src/realm-execution/test_e2e.cc      | 131 +++++++++---------
 7 files changed, 159 insertions(+), 92 deletions(-)
 create mode 100644 lib/realm-execution/include/realm-execution/tasks/impl/controller_task_args.dtg.toml

diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 287218749e..22e8bd2e80 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -6,6 +6,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tasks/impl/controller_task.h"
 
 namespace FlexFlow {
 
@@ -32,7 +33,7 @@ struct RealmManager : private RealmContext {
    * on the resulting event** to ensure it remains in scope until the controller
    * completes.
    */
-  [[nodiscard]] Realm::Event
+  [[nodiscard]] ControllerTaskResult
       start_controller(std::function<void(RealmContext &)>,
                        Realm::Event wait_on = Realm::Event::NO_EVENT);
 };
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
index 7219c5c07f..abbd4019ae 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -3,9 +3,45 @@
 
 #include "realm-execution/realm.h"
 #include "realm-execution/realm_context.h"
+#include "realm-execution/tasks/impl/controller_task_args.dtg.h"
+#include <memory>
 
 namespace FlexFlow {
 
+/**
+ * \brief Holds the result of launching a controller task via \ref
+ * collective_spawn_controller_task. Owns the heap-allocated \ref
+ * ControllerTaskArgs so they remain valid until the task completes. The
+ * destructor automatically waits for the controller to finish before freeing
+ * the args, preventing use-after-free while the controller is running.
+ *
+ * \note Users must explicitly block by waiting on the result before mutating
+ * the contents of any values captured (e.g., by closure) in the controller.
+ * Otherwise the controller may race with the caller.
+ */
+struct ControllerTaskResult {
+public:
+  explicit ControllerTaskResult(std::unique_ptr<ControllerTaskArgs> args,
+                                Realm::Event event);
+
+  ControllerTaskResult(ControllerTaskResult const &) = delete;
+  ControllerTaskResult(ControllerTaskResult &&) = delete;
+  ControllerTaskResult &operator=(ControllerTaskResult const &) = delete;
+  ControllerTaskResult &operator=(ControllerTaskResult &&) = delete;
+
+  /**
+   * \brief Block until the controller task completes. Must be called before
+   * mutating any data captured by the controller thunk to avoid data races.
+   */
+  void wait();
+
+  ~ControllerTaskResult();
+
+private:
+  std::unique_ptr<ControllerTaskArgs> args;
+  Realm::Event event;
+};
+
 /**
  * \brief A stub function to work around Realm not allowing lambdas to be be
  * registered as Realm tasks. Takes the desired lambda to run as the \ref
@@ -18,7 +54,7 @@ void controller_task_body(
  * \brief Dispatches the \ref term-controller task. Packages up the provided \c
  * std::function and passes it along to \ref controller_task_body.
  */
-Realm::Event
+ControllerTaskResult
     collective_spawn_controller_task(RealmContext &ctx,
                                      Realm::Processor &target_proc,
                                      std::function<void(RealmContext &)> thunk,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task_args.dtg.toml
new file mode 100644
index 0000000000..0c0bd7b96c
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task_args.dtg.toml
@@ -0,0 +1,13 @@
+namespace = "FlexFlow"
+name = "ControllerTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "realm-execution/realm_context.h",
+  "functional",
+]
+
+[[fields]]
+name = "thunk"
+type = "std::function<void(::FlexFlow::RealmContext &)>"
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index fc74fffe5d..5dbe4e91ae 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -1,6 +1,5 @@
 #include "realm-execution/realm_manager.h"
 #include "realm-execution/realm_context.h"
-#include "realm-execution/tasks/impl/controller_task.h"
 #include "realm-execution/tasks/realm_task_registry.h"
 
 namespace FlexFlow {
@@ -20,7 +19,7 @@ RealmManager::~RealmManager() {
   this->runtime.wait_for_shutdown();
 }
 
-Realm::Event
+ControllerTaskResult
     RealmManager::start_controller(std::function<void(RealmContext &)> thunk,
                                    Realm::Event wait_on) {
   Realm::Processor target_proc =
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
index 285e8acaa7..925402bcb4 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
@@ -1,39 +1,69 @@
+#include "realm-execution/tasks/impl/controller_task.h"
 #include "realm-execution/tasks/impl/op_task.h"
 #include "realm-execution/tasks/task_id_t.h"
+#include <cstring>
 
 namespace FlexFlow {
 
-struct ControllerTaskArgs {
-public:
-  std::function<void(RealmContext &)> thunk;
-};
+// Takes ownership of the heap-allocated task arguments and stores the event
+// returned when the controller task that reads them was spawned.
+ControllerTaskResult::ControllerTaskResult(
+    std::unique_ptr<ControllerTaskArgs> args, Realm::Event event)
+    : args(std::move(args)), event(event) {}
+
+// Blocks the caller on the controller task's event.
+void ControllerTaskResult::wait() {
+  this->event.wait();
+}
+
+// The spawned task only receives a raw pointer to the arguments, so while this
+// object still holds them it waits on the event before they are freed.
+ControllerTaskResult::~ControllerTaskResult() {
+  if (this->args != nullptr) {
+    this->event.wait();
+  }
+}
 
+// Task entry point: the argument buffer holds a single ControllerTaskArgs
+// pointer (written by collective_spawn_controller_task), whose thunk is
+// invoked with a RealmContext built from proc.
 void controller_task_body(void const *args,
                           size_t arglen,
                           void const *userdata,
                           size_t userlen,
                           Realm::Processor proc) {
-  ASSERT(arglen == sizeof(ControllerTaskArgs));
-  ControllerTaskArgs task_args =
-      *reinterpret_cast<ControllerTaskArgs const *>(args);
+  ASSERT(arglen == sizeof(ControllerTaskArgs *));
+  ControllerTaskArgs *task_args_ptr;
+  // The buffer contains the pointer value itself; copy it out
+  std::memcpy(&task_args_ptr, args, sizeof(ControllerTaskArgs *));
 
   RealmContext ctx{proc};
-  task_args.thunk(ctx);
+  task_args_ptr->thunk(ctx);
 }
 
-Realm::Event
+// Spawns the controller task on target_proc with a pointer to a heap copy of
+// thunk as its argument, forwarding precondition to the spawn call. The result
+// owns that copy; its destructor waits for the task before releasing it.
+ControllerTaskResult
     collective_spawn_controller_task(RealmContext &ctx,
                                      Realm::Processor &target_proc,
                                      std::function<void(RealmContext &)> thunk,
                                      Realm::Event precondition) {
-  ControllerTaskArgs task_args;
-  task_args.thunk = thunk;
-
-  return ctx.collective_spawn_task(target_proc,
-                                   task_id_t::CONTROLLER_TASK_ID,
-                                   &task_args,
-                                   sizeof(task_args),
-                                   precondition);
+  // ControllerTaskArgs are passed by pointer because they are NOT trivially
+  // copyable. The unique_ptr owns them from the moment they are allocated, so
+  // they cannot leak if spawning throws, and ControllerTaskResult then manages
+  // their lifetime to avoid use-after-free while the controller runs
+  std::unique_ptr<ControllerTaskArgs> task_args{
+      new ControllerTaskArgs{std::move(thunk)}};
+  ControllerTaskArgs *raw_ptr = task_args.get();
+
+  Realm::Event event = ctx.collective_spawn_task(target_proc,
+                                                 task_id_t::CONTROLLER_TASK_ID,
+                                                 &raw_ptr,
+                                                 sizeof(raw_ptr),
+                                                 precondition);
+
+  return ControllerTaskResult{std::move(task_args), event};
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
index 4063ec32f2..d0943ce12c 100644
--- a/lib/realm-execution/test/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -20,13 +20,14 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     // Launch a controller
     int some_data = 123;
-    Realm::Event event = manager.start_controller([&](RealmContext &ctx) {
-      // Data is captured and retains value
-      ASSERT(some_data == 123);
-    });
-    // Need to block on the completion of the event to ensure we don't race,
+    ControllerTaskResult result =
+        manager.start_controller([&](RealmContext &ctx) {
+          // Data is captured and retains value
+          ASSERT(some_data == 123);
+        });
+    // Need to block on the completion of the task to ensure we don't race,
     // because the lambda captures the environment
-    event.wait();
+    result.wait();
   }
 }
 
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 8e5918b0f0..a700123c67 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -419,72 +419,73 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RealmManager manager(&fake_argc, &fake_argv);
 
-    Realm::Event e = manager.start_controller([&](RealmContext &ctx) {
-      Allocator allocator = ctx.get_current_device_allocator();
-
-      GenericTensorAccessorW label_tensor_backing =
-          allocator.allocate_tensor(output_tensor_shape);
-
-      GenericTensorAccessorW label_tensor =
-          allocator.allocate_tensor(label_tensor_shape);
-
-      std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
-          input_tensors;
-
-      DistributedFfHandle device_handle =
-          create_distributed_ff_handle(ctx,
-                                       /*workSpaceSize=*/1024 * 1024,
-                                       /*allowTensorOpMathConversion=*/true);
-
-      PCGInstance pcg_instance = create_pcg_instance(
-          /*ctx=*/ctx,
-          /*mpcg=*/mpcg,
-          /*optimizer=*/optimizer_attrs,
-          /*loss=*/loss_attrs,
-          /*label_tensor=*/label_tensor,
-          /*logit_tensor=*/t_linear_2,
-          /*loss_mapping=*/loss_mapping,
-          /*input_tensors=*/input_tensors,
-          /*profiling_settings=*/ProfilingSettings{0, 0},
-          /*device_handle=*/device_handle,
-          /*iteration_config=*/FFIterationConfig{1_p});
-
-      // begin training loop
-      int num_epochs = 5;
-      std::vector<GenericTensorAccessorR> loss_values;
-
-      for (int i = 0; i < num_epochs; i++) {
-        perform_all_passes_for_pcg_instance(
-            /*instance=*/pcg_instance,
-            /*profiling_settings=*/ProfilingSettings{0, 0},
-            /*device_handle=*/device_handle,
-            /*iteration_config=*/FFIterationConfig{1_p});
-        loss_values.push_back(copy_tensor_accessor_r(
-            dynamic_tensor_accessor_from_instance(
-                pcg_instance.get_loss_tensor_instance().value(),
-                Realm::Event::NO_EVENT,
-                lift_to_parallel(
-                    TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}},
-                                DataType::FLOAT}),
-                Permissions::RO,
-                ctx.get_current_processor())
-                .require_read(),
-            allocator));
-      }
-
-      // Assert that each sample in the batch has a lower loss in last epoch
-      // than the first epoch
-      GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-      GenericTensorAccessorR last_epoch_loss = loss_values.back();
-      CHECK_MESSAGE(
-          did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator),
-          check_kv("first_epoch_loss",
-                   format_accessor_r_contents(first_epoch_loss)),
-          check_kv("last_epoch_loss",
-                   format_accessor_r_contents(last_epoch_loss)));
-    });
+    ControllerTaskResult result =
+        manager.start_controller([&](RealmContext &ctx) {
+          Allocator allocator = ctx.get_current_device_allocator();
+
+          GenericTensorAccessorW label_tensor_backing =
+              allocator.allocate_tensor(output_tensor_shape);
+
+          GenericTensorAccessorW label_tensor =
+              allocator.allocate_tensor(label_tensor_shape);
+
+          std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
+              input_tensors;
+
+          DistributedFfHandle device_handle = create_distributed_ff_handle(
+              ctx,
+              /*workSpaceSize=*/1024 * 1024,
+              /*allowTensorOpMathConversion=*/true);
+
+          PCGInstance pcg_instance = create_pcg_instance(
+              /*ctx=*/ctx,
+              /*mpcg=*/mpcg,
+              /*optimizer=*/optimizer_attrs,
+              /*loss=*/loss_attrs,
+              /*label_tensor=*/label_tensor,
+              /*logit_tensor=*/t_linear_2,
+              /*loss_mapping=*/loss_mapping,
+              /*input_tensors=*/input_tensors,
+              /*profiling_settings=*/ProfilingSettings{0, 0},
+              /*device_handle=*/device_handle,
+              /*iteration_config=*/FFIterationConfig{1_p});
+
+          // begin training loop
+          int num_epochs = 5;
+          std::vector<GenericTensorAccessorR> loss_values;
+
+          for (int i = 0; i < num_epochs; i++) {
+            perform_all_passes_for_pcg_instance(
+                /*instance=*/pcg_instance,
+                /*profiling_settings=*/ProfilingSettings{0, 0},
+                /*device_handle=*/device_handle,
+                /*iteration_config=*/FFIterationConfig{1_p});
+            loss_values.push_back(copy_tensor_accessor_r(
+                dynamic_tensor_accessor_from_instance(
+                    pcg_instance.get_loss_tensor_instance().value(),
+                    Realm::Event::NO_EVENT,
+                    lift_to_parallel(TensorShape{
+                        TensorDims{FFOrdered{output_dim, hidden_dim}},
+                        DataType::FLOAT}),
+                    Permissions::RO,
+                    ctx.get_current_processor())
+                    .require_read(),
+                allocator));
+          }
+
+          // Assert that each sample in the batch has a lower loss in last epoch
+          // than the first epoch
+          GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+          GenericTensorAccessorR last_epoch_loss = loss_values.back();
+          CHECK_MESSAGE(
+              did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator),
+              check_kv("first_epoch_loss",
+                       format_accessor_r_contents(first_epoch_loss)),
+              check_kv("last_epoch_loss",
+                       format_accessor_r_contents(last_epoch_loss)));
+        });
 
-    e.wait();
+    result.wait();
     //! [realm-execution example]
   }
 }

From 371ec0caf50f6c2f936635388730b22061a71cb9 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 10:34:15 -0700
Subject: [PATCH 102/113] Make sure copied task arguments are actually
 trivially copyable.

---
 .../src/realm-execution/tasks/impl/ff_handle_init_return_task.cc | 1 +
 .../tasks/impl/per_device_op_state_init_return_task.cc           | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
index 1a90052fa7..dde41d0838 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
@@ -18,6 +18,7 @@ struct FfHandleInitReturnTaskArgs {
   Realm::Processor origin_proc;
   DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
 };
+static_assert(std::is_trivially_copyable_v<FfHandleInitReturnTaskArgs>);
 
 void ff_handle_init_return_task_body(void const *args,
                                      size_t arglen,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
index 222ddb28b8..01de48cdc6 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc
@@ -18,6 +18,7 @@ struct PerDeviceOpStateInitReturnTaskArgs {
   Realm::Processor origin_proc;
   DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr;
 };
+static_assert(std::is_trivially_copyable_v<PerDeviceOpStateInitReturnTaskArgs>);
 
 void per_device_op_state_init_return_task_body(void const *args,
                                                size_t arglen,

From b76ea0e16137e604a54ce4d0381f5e4e567b2bd7 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 11:00:37 -0700
Subject: [PATCH 103/113] Update device handle function naming.

---
 lib/kernels/include/kernels/device_handle_t.h                 | 4 ++--
 lib/kernels/src/kernels/device_handle_t.cc                    | 4 ++--
 lib/kernels/test/src/kernels/linear_kernels.cc                | 4 ++--
 .../device_specific_managed_per_device_ff_handle.cc           | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/kernels/include/kernels/device_handle_t.h b/lib/kernels/include/kernels/device_handle_t.h
index 0836503717..a4df2ad753 100644
--- a/lib/kernels/include/kernels/device_handle_t.h
+++ b/lib/kernels/include/kernels/device_handle_t.h
@@ -6,10 +6,10 @@
 
 namespace FlexFlow {
 
-device_handle_t device_handle_t_from_managed_handle(
+device_handle_t device_handle_t_from_managed_ff_handle(
     std::optional<ManagedPerDeviceFFHandle> const &managed_handle);
 
-device_handle_t device_handle_t_from_managed_handle_ptr(
+device_handle_t device_handle_t_from_managed_ff_handle_ptr(
     std::optional<ManagedPerDeviceFFHandle *> const &managed_handle);
 
 device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle);
diff --git a/lib/kernels/src/kernels/device_handle_t.cc b/lib/kernels/src/kernels/device_handle_t.cc
index 0225ee8e94..bb1a2a5ba4 100644
--- a/lib/kernels/src/kernels/device_handle_t.cc
+++ b/lib/kernels/src/kernels/device_handle_t.cc
@@ -2,7 +2,7 @@
 
 namespace FlexFlow {
 
-device_handle_t device_handle_t_from_managed_handle(
+device_handle_t device_handle_t_from_managed_ff_handle(
     std::optional<ManagedPerDeviceFFHandle> const &managed_handle) {
   if (managed_handle.has_value()) {
     return gpu_make_device_handle_t(managed_handle.value().raw_handle());
@@ -11,7 +11,7 @@ device_handle_t device_handle_t_from_managed_handle(
   }
 }
 
-device_handle_t device_handle_t_from_managed_handle_ptr(
+device_handle_t device_handle_t_from_managed_ff_handle_ptr(
     std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
   if (managed_handle.has_value()) {
     return gpu_make_device_handle_t(managed_handle.value()->raw_handle());
diff --git a/lib/kernels/test/src/kernels/linear_kernels.cc b/lib/kernels/test/src/kernels/linear_kernels.cc
index 423e6be4f1..e96c5ef243 100644
--- a/lib/kernels/test/src/kernels/linear_kernels.cc
+++ b/lib/kernels/test/src/kernels/linear_kernels.cc
@@ -91,7 +91,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
       std::optional<LinearPerDeviceState> per_device_state = linear_init_kernel(
           /*device_type=*/device_type,
-          /*handle=*/device_handle_t_from_managed_handle(managed_handle),
+          /*handle=*/device_handle_t_from_managed_ff_handle(managed_handle),
           /*activation=*/attrs.activation,
           /*regularizer=*/attrs.regularizer,
           /*use_bias=*/attrs.use_bias,
@@ -203,7 +203,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
       std::optional<LinearPerDeviceState> per_device_state = linear_init_kernel(
           /*device_type=*/device_type,
-          /*handle=*/device_handle_t_from_managed_handle(managed_handle),
+          /*handle=*/device_handle_t_from_managed_ff_handle(managed_handle),
           /*activation=*/attrs.activation,
           /*regularizer=*/attrs.regularizer,
           /*use_bias=*/true,
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index ae9fc669d3..4445054dc6 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -15,7 +15,7 @@ DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
 device_handle_t device_handle_t_from_device_specific_managed_handle(
     DeviceSpecificManagedPerDeviceFFHandle const &device_specific,
     device_id_t device_idx) {
-  return device_handle_t_from_managed_handle_ptr(
+  return device_handle_t_from_managed_ff_handle_ptr(
       device_specific.get(device_idx));
 }
 

From 360ff5af43213a7f98630a0582a8ddf7f56ac174 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 11:13:11 -0700
Subject: [PATCH 104/113] Doc comments.

---
 .../include/realm-execution/atomic_dependency_set.h           | 4 ++++
 lib/realm-execution/include/realm-execution/dependency_set.h  | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
index da6ba86638..7d7214e42b 100644
--- a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
+++ b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
@@ -6,6 +6,10 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief Tracks dependencies for a given parallel tensor shard using the SWMR
+ * (single-writer, multiple-reader) algorithm.
+ */
 struct AtomicDependencySet {
 public:
   AtomicDependencySet() = delete;
diff --git a/lib/realm-execution/include/realm-execution/dependency_set.h b/lib/realm-execution/include/realm-execution/dependency_set.h
index bd6ab04cea..ba8e9dc9b5 100644
--- a/lib/realm-execution/include/realm-execution/dependency_set.h
+++ b/lib/realm-execution/include/realm-execution/dependency_set.h
@@ -9,7 +9,8 @@
 namespace FlexFlow {
 
 /**
- * @brief Tracks dependencies during execution of tasks.
+ * @brief Tracks dependencies on values during the execution of tasks, using the
+ * SWMR (single-writer, multiple-reader) algorithm.
  */
 struct DependencySet {
 public:

From b9ae626afc2e81957a4d9e5eaba60739fb086448 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 11:18:53 -0700
Subject: [PATCH 105/113] Remove DeviceSpecificManagedPerDeviceFFHandle alias.

---
 .../device_specific_managed_per_device_ff_handle.h  | 12 +++++-------
 .../include/realm-execution/distributed_ff_handle.h |  7 ++++---
 .../tasks/impl/ff_handle_init_return_task.h         |  4 ++--
 .../tasks/impl/ff_handle_init_task.h                |  2 +-
 .../tasks/impl/ff_handle_init_task_args.dtg.toml    |  2 +-
 .../include/realm-execution/tasks/impl/op_task.h    |  2 +-
 .../tasks/impl/op_task_args.dtg.toml                |  2 +-
 .../tasks/impl/per_device_op_state_init_task.h      |  2 +-
 .../per_device_op_state_init_task_args.dtg.toml     |  2 +-
 .../device_specific_managed_per_device_ff_handle.cc | 13 +++++++------
 .../src/realm-execution/distributed_ff_handle.cc    | 10 ++++++----
 .../tasks/impl/ff_handle_init_return_task.cc        | 12 ++++++------
 .../tasks/impl/ff_handle_init_task.cc               |  6 +++---
 .../src/realm-execution/tasks/impl/op_task.cc       |  4 ++--
 .../tasks/impl/per_device_op_state_init_task.cc     |  4 ++--
 .../impl/serializable_ff_handle_init_task_args.cc   |  2 +-
 16 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
index 9a42861fcd..b698e613b5 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -9,14 +9,12 @@
 
 namespace FlexFlow {
 
-using DeviceSpecificManagedPerDeviceFFHandle =
-    DeviceSpecificPtr<ManagedPerDeviceFFHandle>;
+DeviceSpecificPtr<ManagedPerDeviceFFHandle>
+    make_device_specific_managed_ff_handle(
+        device_id_t const &, std::optional<ManagedPerDeviceFFHandle *> const &);
 
-DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
-    device_id_t const &, std::optional<ManagedPerDeviceFFHandle *> const &);
-
-device_handle_t device_handle_t_from_device_specific_managed_handle(
-    DeviceSpecificManagedPerDeviceFFHandle const &, device_id_t);
+device_handle_t device_handle_t_from_device_specific_managed_ff_handle(
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &, device_id_t);
 
 } // namespace FlexFlow
 
diff --git a/lib/realm-execution/include/realm-execution/distributed_ff_handle.h b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
index 2a500ff150..8409a234a7 100644
--- a/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
+++ b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h
@@ -17,14 +17,15 @@ struct DistributedFfHandle {
   DistributedFfHandle() = delete;
   explicit DistributedFfHandle(
       std::unordered_map<Realm::Processor,
-                         DeviceSpecificManagedPerDeviceFFHandle> const
+                         DeviceSpecificPtr<ManagedPerDeviceFFHandle>> const
           &handles);
 
-  DeviceSpecificManagedPerDeviceFFHandle const &
+  DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &
       at(Realm::Processor processor) const;
 
 private:
-  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
+  std::unordered_map<Realm::Processor,
+                     DeviceSpecificPtr<ManagedPerDeviceFFHandle>>
       handles;
 };
 
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index f6a07e97d4..20942a14f7 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -28,8 +28,8 @@ void ff_handle_init_return_task_body(
 Realm::Event spawn_ff_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
-    DeviceSpecificManagedPerDeviceFFHandle const &result,
-    DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &result,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> *origin_result_ptr,
     Realm::Event precondition);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
index 64384b6ae6..196129c300 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h
@@ -30,7 +30,7 @@ Realm::Event spawn_ff_handle_init_task(
     Realm::Processor target_proc,
     size_t workSpaceSize,
     bool allowTensorOpMathConversion,
-    DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> *result_ptr,
     Realm::Event precondition);
 
 } // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml
index 808a350091..553568e673 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml
@@ -23,4 +23,4 @@ type = "::FlexFlow::Realm::Processor"
 
 [[fields]]
 name = "origin_result_ptr"
-type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle *"
+type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::ManagedPerDeviceFFHandle> *"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
index 4aa0329a96..9ad8a6ed38 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -59,7 +59,7 @@ Realm::Event spawn_op_task(
     TensorInstanceBacking const &tensor_backing,
     std::optional<DeviceSpecificPtr<PerDeviceOpState>> const &device_state,
     ProfilingSettings const &profiling_settings,
-    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &device_handle,
     FFIterationConfig const &iteration_config,
     std::optional<OptimizerAttrs> const &optimizer_attrs,
     Realm::Event precondition);
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
index f6bb83fbca..90202bcbf3 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -32,7 +32,7 @@ type = "::FlexFlow::ProfilingSettings"
 
 [[fields]]
 name = "device_handle"
-type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle"
+type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::ManagedPerDeviceFFHandle>"
 
 [[fields]]
 name = "iteration_config"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
index 95b768a245..11437d5df8 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h
@@ -38,7 +38,7 @@ std::optional<Realm::Event> spawn_per_device_op_state_init_task(
     DynamicNodeInvocation const &invocation,
     TensorInstanceBacking const &tensor_backing,
     ProfilingSettings const &profiling_settings,
-    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
     DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
index 57012ce716..6711050f1a 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
@@ -29,7 +29,7 @@ type = "::FlexFlow::ProfilingSettings"
 
 [[fields]]
 name = "device_handle"
-type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle"
+type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::ManagedPerDeviceFFHandle>"
 
 [[fields]]
 name = "iteration_config"
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
index 4445054dc6..c16f17d168 100644
--- a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -6,14 +6,15 @@
 
 namespace FlexFlow {
 
-DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
-    device_id_t const &device_id,
-    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
-  return DeviceSpecificManagedPerDeviceFFHandle{device_id, managed_handle};
+DeviceSpecificPtr<ManagedPerDeviceFFHandle>
+    make_device_specific_managed_ff_handle(
+        device_id_t const &device_id,
+        std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
+  return DeviceSpecificPtr<ManagedPerDeviceFFHandle>{device_id, managed_handle};
 }
 
-device_handle_t device_handle_t_from_device_specific_managed_handle(
-    DeviceSpecificManagedPerDeviceFFHandle const &device_specific,
+device_handle_t device_handle_t_from_device_specific_managed_ff_handle(
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &device_specific,
     device_id_t device_idx) {
   return device_handle_t_from_managed_ff_handle_ptr(
       device_specific.get(device_idx));
diff --git a/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
index 986401956a..2fdc31d2c5 100644
--- a/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc
@@ -7,10 +7,11 @@ namespace FlexFlow {
 
 DistributedFfHandle::DistributedFfHandle(
     std::unordered_map<Realm::Processor,
-                       DeviceSpecificManagedPerDeviceFFHandle> const &handles)
+                       DeviceSpecificPtr<ManagedPerDeviceFFHandle>> const
+        &handles)
     : handles(handles) {}
 
-DeviceSpecificManagedPerDeviceFFHandle const &
+DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &
     DistributedFfHandle::at(Realm::Processor processor) const {
   return this->handles.at(processor);
 }
@@ -20,7 +21,8 @@ DistributedFfHandle
                                  size_t workSpaceSize,
                                  bool allowTensorOpMathConversion,
                                  Realm::Event precondition) {
-  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
+  std::unordered_map<Realm::Processor,
+                     DeviceSpecificPtr<ManagedPerDeviceFFHandle>>
       handles;
 
   // Allocate space for the result before launching any tasks
@@ -29,7 +31,7 @@ DistributedFfHandle
     if (proc.kind() == Realm::Processor::LOC_PROC ||
         proc.kind() == Realm::Processor::TOC_PROC) {
       handles.insert({proc,
-                      make_device_specific_managed_handle(
+                      make_device_specific_managed_ff_handle(
                           ctx.get_current_device_idx(), std::nullopt)});
     }
   }
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
index dde41d0838..6aab724a2a 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc
@@ -7,16 +7,16 @@ struct FfHandleInitReturnTaskArgs {
 public:
   FfHandleInitReturnTaskArgs() = delete;
   FfHandleInitReturnTaskArgs(
-      DeviceSpecificManagedPerDeviceFFHandle result,
+      DeviceSpecificPtr<ManagedPerDeviceFFHandle> result,
       Realm::Processor origin_proc,
-      DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr)
+      DeviceSpecificPtr<ManagedPerDeviceFFHandle> *origin_result_ptr)
       : result(result), origin_proc(origin_proc),
         origin_result_ptr(origin_result_ptr) {}
 
 public:
-  DeviceSpecificManagedPerDeviceFFHandle result;
+  DeviceSpecificPtr<ManagedPerDeviceFFHandle> result;
   Realm::Processor origin_proc;
-  DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
+  DeviceSpecificPtr<ManagedPerDeviceFFHandle> *origin_result_ptr;
 };
 static_assert(std::is_trivially_copyable_v<FfHandleInitReturnTaskArgs>);
 
@@ -36,8 +36,8 @@ void ff_handle_init_return_task_body(void const *args,
 Realm::Event spawn_ff_handle_init_return_task(
     RealmContext &ctx,
     Realm::Processor origin_proc,
-    DeviceSpecificManagedPerDeviceFFHandle const &result,
-    DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &result,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> *origin_result_ptr,
     Realm::Event precondition) {
   FfHandleInitReturnTaskArgs task_args{result, origin_proc, origin_result_ptr};
 
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
index 86d03e45f3..f47b957f32 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc
@@ -37,8 +37,8 @@ void ff_handle_init_task_body(void const *args,
       deserialize_task_args<SerializableFfHandleInitTaskArgs>(args, arglen));
 
   RealmContext ctx{proc};
-  DeviceSpecificManagedPerDeviceFFHandle managed_handle =
-      make_device_specific_managed_handle(
+  DeviceSpecificPtr<ManagedPerDeviceFFHandle> managed_handle =
+      make_device_specific_managed_ff_handle(
           ctx.get_current_device_idx(),
           make_ff_handle_for_processor(proc,
                                        task_args.workSpaceSize,
@@ -56,7 +56,7 @@ Realm::Event spawn_ff_handle_init_task(
     Realm::Processor target_proc,
     size_t workSpaceSize,
     bool allowTensorOpMathConversion,
-    DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> *result_ptr,
     Realm::Event precondition) {
 
   FfHandleInitTaskArgs task_args = FfHandleInitTaskArgs{
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
index 2eaec4d6ea..0d20baa0a3 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -26,7 +26,7 @@ void op_task_body(void const *args,
 
   RealmContext ctx{proc};
   device_handle_t device_handle =
-      device_handle_t_from_device_specific_managed_handle(
+      device_handle_t_from_device_specific_managed_ff_handle(
           task_args.device_handle, ctx.get_current_device_idx());
 
   // Patch the invocation to include the provided instances
@@ -68,7 +68,7 @@ Realm::Event spawn_op_task(
     TensorInstanceBacking const &tensor_backing,
     std::optional<DeviceSpecificPtr<PerDeviceOpState>> const &device_state,
     ProfilingSettings const &profiling_settings,
-    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &device_handle,
     FFIterationConfig const &iteration_config,
     std::optional<OptimizerAttrs> const &optimizer_attrs,
     Realm::Event precondition) {
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
index c5ff8f39be..753fccf74b 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc
@@ -30,7 +30,7 @@ void per_device_op_state_init_task_body(void const *args,
 
   RealmContext ctx{proc};
   device_handle_t device_handle =
-      device_handle_t_from_device_specific_managed_handle(
+      device_handle_t_from_device_specific_managed_ff_handle(
           task_args.device_handle, ctx.get_current_device_idx());
 
   // Patch the invocation to include the provided instances
@@ -79,7 +79,7 @@ std::optional<Realm::Event> spawn_per_device_op_state_init_task(
     DynamicNodeInvocation const &invocation,
     TensorInstanceBacking const &tensor_backing,
     ProfilingSettings const &profiling_settings,
-    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle> const &device_handle,
     FFIterationConfig const &iteration_config,
     OptimizerAttrs const &optimizer_attrs,
     DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
index 0aaa3dacae..6045ffe972 100644
--- a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc
@@ -19,7 +19,7 @@ FfHandleInitTaskArgs ff_handle_init_task_args_from_serializable(
       /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
       /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
       /*origin_result_ptr=*/
-      reinterpret_cast<DeviceSpecificManagedPerDeviceFFHandle *>(
+      reinterpret_cast<DeviceSpecificPtr<ManagedPerDeviceFFHandle> *>(
           args.origin_result_ptr),
   };
 }

From cd854af8a438014a731cdaa723160546a297e3a5 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 11:45:06 -0700
Subject: [PATCH 106/113] Add LossConfig and ParallelLossConfig types.

---
 .../computation_graph_instance.h              |  7 ++---
 .../local-execution/loss_config.dtg.toml      | 22 +++++++++++++++
 .../computation_graph_instance.cc             | 14 ++++------
 .../cost_estimator/local_cost_estimator.cc    |  4 +--
 .../src/local-execution/loss_functions.cc     |  9 ++++---
 .../test/src/local-execution/test_e2e.cc      | 18 ++++++++-----
 .../parallel_loss_config.dtg.toml             | 27 +++++++++++++++++++
 .../include/realm-execution/pcg_instance.h    |  9 ++-----
 .../src/realm-execution/pcg_instance.cc       | 16 +++++------
 .../test/src/realm-execution/test_e2e.cc      | 22 +++++++++------
 10 files changed, 97 insertions(+), 51 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/loss_config.dtg.toml
 create mode 100644 lib/realm-execution/include/realm-execution/parallel_loss_config.dtg.toml

diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h b/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h
index 8e09b7960b..c43001397b 100644
--- a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h
+++ b/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h
@@ -5,11 +5,10 @@
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/profiling_settings.dtg.h"
-#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "local-execution/loss_config.dtg.h"
 #include "pcg/computation_graph.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "pcg/tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
@@ -45,9 +44,7 @@ struct ComputationGraphInstance {
 ComputationGraphInstance create_computation_graph_instance(
     ComputationGraph const &cg,
     OptimizerAttrs const &optimizer_attrs,
-    std::optional<LossAttrs> const &loss_attrs,
-    std::optional<GenericTensorAccessorR> label_tensor,
-    std::optional<tensor_guid_t> logit_tensor,
+    std::optional<LossConfig> const &loss,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     Allocator &allocator,
diff --git a/lib/local-execution/include/local-execution/loss_config.dtg.toml b/lib/local-execution/include/local-execution/loss_config.dtg.toml
new file mode 100644
index 0000000000..dbc28a5360
--- /dev/null
+++ b/lib/local-execution/include/local-execution/loss_config.dtg.toml
@@ -0,0 +1,22 @@
+namespace = "FlexFlow"
+name = "LossConfig"
+type = "struct"
+features = []
+
+includes = [
+  "op-attrs/ops/loss_functions/loss_attrs.dtg.h",
+  "kernels/accessor.h",
+  "pcg/tensor_guid_t.dtg.h",
+]
+
+[[fields]]
+name = "loss_attrs"
+type = "::FlexFlow::LossAttrs"
+
+[[fields]]
+name = "label_tensor"
+type = "::FlexFlow::GenericTensorAccessorR"
+
+[[fields]]
+name = "logit_tensor"
+type = "::FlexFlow::tensor_guid_t"
diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
index 8c3a30a82d..961dfae3f1 100644
--- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
+++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
@@ -61,9 +61,7 @@ static GenericTensorAccessorW
 ComputationGraphInstance create_computation_graph_instance(
     ComputationGraph const &cg,
     OptimizerAttrs const &optimizer_attrs,
-    std::optional<LossAttrs> const &loss_attrs,
-    std::optional<GenericTensorAccessorR> label_tensor,
-    std::optional<tensor_guid_t> logit_tensor,
+    std::optional<LossConfig> const &loss,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     Allocator &allocator,
@@ -77,15 +75,13 @@ ComputationGraphInstance create_computation_graph_instance(
   std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> inputs =
       input_tensors;
   std::optional<DynamicValueAttrs> logit_grad_value;
-  if (loss_attrs.has_value()) {
+  if (loss.has_value()) {
+    auto [loss_attrs, label_tensor, logit_tensor] = assert_unwrap(loss);
     auto [loss_inserted_dg, label_v, logit_grad_v] = perform_loss_insertion(
-        dg,
-        assert_unwrap(loss_attrs),
-        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)},
-        std::nullopt);
+        dg, loss_attrs, dynamic_tensor_guid_t{logit_tensor}, std::nullopt);
     dg = loss_inserted_dg;
     logit_grad_value = logit_grad_v;
-    inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
+    inputs.insert(std::pair{label_v, label_tensor});
   }
 
   dg = perform_update_insertion(dg, optimizer_attrs);
diff --git a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
index b9cd0c238d..89010c543e 100644
--- a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
+++ b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
@@ -122,9 +122,7 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
   ComputationGraphInstance instance = create_computation_graph_instance(
       /*compgraph=*/cg,
       /*optimizer_attrs=*/optimizer_attrs,
-      /*loss_attrs=*/std::nullopt,
-      /*label_tensor=*/std::nullopt,
-      /*logit_tensor=*/std::nullopt,
+      /*loss=*/std::nullopt,
       /*input_tensors=*/{},
       /*allocator=*/allocator,
       /*profiling_settings=*/this->profiling_settings,
diff --git a/lib/local-execution/test/src/local-execution/loss_functions.cc b/lib/local-execution/test/src/local-execution/loss_functions.cc
index b885778b46..39aa5f138a 100644
--- a/lib/local-execution/test/src/local-execution/loss_functions.cc
+++ b/lib/local-execution/test/src/local-execution/loss_functions.cc
@@ -97,9 +97,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           create_computation_graph_instance(
               /*cg=*/computation_graph,
               /*optimizer=*/optimizer_attrs,
-              /*loss=*/loss_attrs,
-              /*label_tensor=*/label_tensor,
-              /*logit_tensor=*/logit_tensor,
+              /*loss=*/
+              LossConfig{
+                  /*loss_attrs=*/loss_attrs,
+                  /*label_tensor=*/label_tensor,
+                  /*logit_tensor=*/logit_tensor,
+              },
               /*input_tensors=*/input_tensors,
               /*allocator=*/allocator,
               /*profiling_settings=*/ProfilingSettings{0, 1},
diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc b/lib/local-execution/test/src/local-execution/test_e2e.cc
index 615ba204cf..15a8a4138f 100644
--- a/lib/local-execution/test/src/local-execution/test_e2e.cc
+++ b/lib/local-execution/test/src/local-execution/test_e2e.cc
@@ -149,9 +149,12 @@ TEST_SUITE(FF_TEST_SUITE) {
         create_computation_graph_instance(
             /*cg=*/computation_graph,
             /*optimizer=*/optimizer_attrs,
-            /*loss=*/loss_attrs,
-            /*label_tensor=*/label_tensor,
-            /*logit_tensor=*/t_linear_2,
+            /*loss=*/
+            LossConfig{
+                /*loss_attrs=*/loss_attrs,
+                /*label_tensor=*/label_tensor,
+                /*logit_tensor=*/t_linear_2,
+            },
             /*input_tensors=*/input_tensors,
             /*allocator=*/allocator,
             /*profiling_settings=*/ProfilingSettings{0, 0},
@@ -317,9 +320,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         create_computation_graph_instance(
             /*cg=*/computation_graph,
             /*optimizer=*/optimizer_attrs,
-            /*loss=*/loss_attrs,
-            /*label_tensor=*/label_tensor,
-            /*logit_tensor=*/t_linear_2,
+            /*loss=*/
+            LossConfig{
+                /*loss_attrs=*/loss_attrs,
+                /*label_tensor=*/label_tensor,
+                /*logit_tensor=*/t_linear_2,
+            },
             /*input_tensors=*/input_tensors,
             /*allocator=*/allocator,
             /*profiling_settings=*/ProfilingSettings{0, 0},
diff --git a/lib/realm-execution/include/realm-execution/parallel_loss_config.dtg.toml b/lib/realm-execution/include/realm-execution/parallel_loss_config.dtg.toml
new file mode 100644
index 0000000000..0a5f964a9e
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/parallel_loss_config.dtg.toml
@@ -0,0 +1,27 @@
+namespace = "FlexFlow"
+name = "ParallelLossConfig"
+type = "struct"
+features = []
+
+includes = [
+  "op-attrs/ops/loss_functions/loss_attrs.dtg.h",
+  "kernels/accessor.h",
+  "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h",
+  "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h",
+]
+
+[[fields]]
+name = "loss_attrs"
+type = "::FlexFlow::LossAttrs"
+
+[[fields]]
+name = "label_tensor"
+type = "::FlexFlow::GenericTensorAccessorR"
+
+[[fields]]
+name = "logit_tensor"
+type = "::FlexFlow::parallel_tensor_guid_t"
+
+[[fields]]
+name = "loss_mapping"
+type = "::FlexFlow::MappedOperatorTaskGroup"
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h
index c615244722..2443e4e66a 100644
--- a/lib/realm-execution/include/realm-execution/pcg_instance.h
+++ b/lib/realm-execution/include/realm-execution/pcg_instance.h
@@ -1,16 +1,14 @@
 #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
 #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H
 
-#include "kernels/accessor.h"
 #include "kernels/allocation.h"
 #include "kernels/device_handle_t.dtg.h"
 #include "kernels/profiling_settings.dtg.h"
-#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "realm-execution/distributed_ff_handle.h"
+#include "realm-execution/parallel_loss_config.dtg.h"
 #include "realm-execution/per_device_op_state_backing.dtg.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tensor_instance_backing.dtg.h"
@@ -85,10 +83,7 @@ PCGInstance create_pcg_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
-    std::optional<LossAttrs> const &loss_attrs,
-    std::optional<GenericTensorAccessorR> label_tensor,
-    std::optional<parallel_tensor_guid_t> logit_tensor,
-    std::optional<MappedOperatorTaskGroup> const &loss_mapping,
+    std::optional<ParallelLossConfig> const &loss,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc
index 60d96eca49..28eeb4bd9d 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc
@@ -79,10 +79,7 @@ PCGInstance create_pcg_instance(
     RealmContext &ctx,
     MappedParallelComputationGraph const &mpcg,
     OptimizerAttrs const &optimizer_attrs,
-    std::optional<LossAttrs> const &loss_attrs,
-    std::optional<GenericTensorAccessorR> label_tensor,
-    std::optional<parallel_tensor_guid_t> logit_tensor,
-    std::optional<MappedOperatorTaskGroup> const &loss_mapping,
+    std::optional<ParallelLossConfig> const &loss,
     std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
         &input_tensors,
     ProfilingSettings const &profiling_settings,
@@ -96,15 +93,14 @@ PCGInstance create_pcg_instance(
   std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> inputs =
       input_tensors;
   std::optional<DynamicValueAttrs> logit_grad_value;
-  if (loss_attrs) {
+  if (loss.has_value()) {
+    auto [loss_attrs, label_tensor, logit_tensor, loss_mapping] =
+        assert_unwrap(loss);
     auto [dg2, label_v, logit_grad_v] = perform_loss_insertion(
-        dg,
-        assert_unwrap(loss_attrs),
-        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)},
-        loss_mapping);
+        dg, loss_attrs, dynamic_tensor_guid_t{logit_tensor}, loss_mapping);
     dg = dg2;
     logit_grad_value = logit_grad_v;
-    inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
+    inputs.insert(std::pair{label_v, label_tensor});
   }
 
   dg = perform_update_insertion(dg, optimizer_attrs);
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index a700123c67..626c7c316a 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -214,10 +214,13 @@ TEST_SUITE(FF_TEST_SUITE) {
           /*ctx=*/ctx,
           /*mpcg=*/mpcg,
           /*optimizer=*/optimizer_attrs,
-          /*loss=*/loss_attrs,
-          /*label_tensor=*/label_tensor,
-          /*logit_tensor=*/t_linear_2,
-          /*loss_mapping=*/loss_mapping,
+          /*loss=*/
+          ParallelLossConfig{
+              /*loss_attrs=*/loss_attrs,
+              /*label_tensor=*/label_tensor,
+              /*logit_tensor=*/t_linear_2,
+              /*loss_mapping=*/loss_mapping,
+          },
           /*input_tensors=*/input_tensors,
           /*profiling_settings=*/ProfilingSettings{0, 0},
           /*device_handle=*/device_handle,
@@ -441,10 +444,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
               /*ctx=*/ctx,
               /*mpcg=*/mpcg,
               /*optimizer=*/optimizer_attrs,
-              /*loss=*/loss_attrs,
-              /*label_tensor=*/label_tensor,
-              /*logit_tensor=*/t_linear_2,
-              /*loss_mapping=*/loss_mapping,
+              /*loss=*/
+              ParallelLossConfig{
+                  /*loss_attrs=*/loss_attrs,
+                  /*label_tensor=*/label_tensor,
+                  /*logit_tensor=*/t_linear_2,
+                  /*loss_mapping=*/loss_mapping,
+              },
               /*input_tensors=*/input_tensors,
               /*profiling_settings=*/ProfilingSettings{0, 0},
               /*device_handle=*/device_handle,

From 114c39088e77262e4e0d78396217227b8e27d4f2 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 12:12:37 -0700
Subject: [PATCH 107/113] Apply review feedback: get_runtime(), mapped_pcg rename.

---
 .../include/realm-execution/realm_context.h           | 11 ++++++++++-
 .../include/realm-execution/realm_manager.h           |  2 +-
 .../distributed_per_device_op_state_initialization.cc |  8 +++-----
 .../src/realm-execution/pcg_instance.cc               |  4 ++--
 .../src/realm-execution/realm_allocator.cc            |  2 +-
 .../src/realm-execution/realm_context.cc              |  4 ++++
 .../src/realm-execution/realm_manager.cc              |  6 +++---
 ...ake_dynamic_open_dataflow_graph_from_mapped_pcg.h} |  2 +-
 ...ke_dynamic_open_dataflow_graph_from_mapped_pcg.cc} |  4 ++--
 9 files changed, 27 insertions(+), 16 deletions(-)
 rename lib/task-spec/include/task-spec/dynamic_graph/{make_dynamic_open_dataflow_graph_from_mpcg.h => make_dynamic_open_dataflow_graph_from_mapped_pcg.h} (97%)
 rename lib/task-spec/src/task-spec/dynamic_graph/{make_dynamic_open_dataflow_graph_from_mpcg.cc => make_dynamic_open_dataflow_graph_from_mapped_pcg.cc} (99%)

diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
index 0d0b412130..e1180147fd 100644
--- a/lib/realm-execution/include/realm-execution/realm_context.h
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -90,7 +90,16 @@ struct RealmContext {
   static std::optional<ManagedPerDeviceFFHandle>
       make_device_handle_for_processor(Realm::Processor processor);
 
-protected:
+  /**
+   * \brief Get the raw Realm runtime
+   *
+   * \note If you use the Realm runtime directly, you are responsible for
+   * waiting on all generated events to ensure that Realm can shut down
+   * correctly.
+   */
+  Realm::Runtime get_runtime();
+
+private:
   Realm::Runtime runtime;
   Realm::Processor processor;
   Allocator allocator;
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
index 22e8bd2e80..3984291641 100644
--- a/lib/realm-execution/include/realm-execution/realm_manager.h
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -17,7 +17,7 @@ namespace FlexFlow {
  */
 struct RealmManager : private RealmContext {
 public:
-  RealmManager(int *argc, char ***argv);
+  explicit RealmManager(int *argc, char ***argv);
   virtual ~RealmManager();
 
   RealmManager() = delete;
diff --git a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
index 1e02fcf5d5..1d02c73ea9 100644
--- a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
@@ -7,7 +7,6 @@
 #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "utils/containers/map_values.h"
-#include "utils/containers/transform.h"
 #include "utils/containers/values.h"
 #include "utils/optional.h"
 #include <optional>
@@ -66,12 +65,11 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
 
   ctx.get_outstanding_events().wait();
 
-  auto deref = [](DynamicNodeInvocation const &i,
-                  DeviceSpecificPtr<PerDeviceOpState> *const &p) {
-    return std::pair{i, *p};
+  auto deref = [](DeviceSpecificPtr<PerDeviceOpState> *const &p) {
+    return *p;
   };
   std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
-      result = transform(device_state_map, deref);
+      result = map_values(device_state_map, deref);
 
   for (DeviceSpecificPtr<PerDeviceOpState> *device_state_ptr :
        values(device_state_map)) {
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc
index 28eeb4bd9d..8263a49b0a 100644
--- a/lib/realm-execution/src/realm-execution/pcg_instance.cc
+++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc
@@ -13,7 +13,7 @@
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion.h"
-#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.h"
 #include "task-spec/dynamic_graph/pass_expansion.h"
 #include "task-spec/dynamic_graph/shard_expansion.h"
 #include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
@@ -87,7 +87,7 @@ PCGInstance create_pcg_instance(
     FFIterationConfig const &iteration_config) {
 
   DynamicOpenDataflowGraph dg =
-      make_dynamic_open_dataflow_graph_from_mpcg(mpcg);
+      make_dynamic_open_dataflow_graph_from_mapped_pcg(mpcg);
   dg = perform_pass_expansion(dg);
 
   std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> inputs =
diff --git a/lib/realm-execution/src/realm-execution/realm_allocator.cc b/lib/realm-execution/src/realm-execution/realm_allocator.cc
index 194210cf5a..9d5f3ff6b4 100644
--- a/lib/realm-execution/src/realm-execution/realm_allocator.cc
+++ b/lib/realm-execution/src/realm-execution/realm_allocator.cc
@@ -31,7 +31,7 @@ void *RealmAllocator::allocate(size_t requested_memory_size) {
   ready.wait();
   void *ptr =
       inst.pointer_untyped(/*offset=*/0, /*datalen=*/requested_memory_size);
-  ASSERT(ptr);
+  ASSERT(ptr != nullptr);
   this->ptr_instances.insert({ptr, inst});
   return ptr;
 }
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
index 4e981e7414..96beb63953 100644
--- a/lib/realm-execution/src/realm-execution/realm_context.cc
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -250,4 +250,8 @@ void RealmContext::discover_machine_topology() {
   }
 }
 
+Realm::Runtime RealmContext::get_runtime() {
+  return this->runtime;
+}
+
 } // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
index 5dbe4e91ae..e76be7054b 100644
--- a/lib/realm-execution/src/realm-execution/realm_manager.cc
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -6,7 +6,7 @@ namespace FlexFlow {
 
 RealmManager::RealmManager(int *argc, char ***argv)
     : RealmContext(Realm::Processor::NO_PROC) {
-  bool ok = this->runtime.init(argc, argv);
+  bool ok = this->get_runtime().init(argc, argv);
   ASSERT(ok);
 
   // Register all tasks at initialization time so we don't need to later
@@ -15,8 +15,8 @@ RealmManager::RealmManager(int *argc, char ***argv)
 
 RealmManager::~RealmManager() {
   Realm::Event outstanding = this->merge_outstanding_events();
-  this->runtime.shutdown(outstanding);
-  this->runtime.wait_for_shutdown();
+  this->get_runtime().shutdown(outstanding);
+  this->get_runtime().wait_for_shutdown();
 }
 
 ControllerTaskResult
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.h
similarity index 97%
rename from lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
rename to lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.h
index 758a0c2813..6a269ec3c9 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
+++ b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.h
@@ -6,7 +6,7 @@
 
 namespace FlexFlow {
 
-DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mapped_pcg(
     MappedParallelComputationGraph const &);
 
 } // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc
similarity index 99%
rename from lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
rename to lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc
index ced98dfd44..e06e7d5a32 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc
@@ -1,4 +1,4 @@
-#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "op-attrs/pcg_operator_attrs.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
@@ -13,7 +13,7 @@
 
 namespace FlexFlow {
 
-DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mapped_pcg(
     MappedParallelComputationGraph const &mpcg) {
   DynamicOpenDataflowGraph result = make_empty_dynamic_open_dataflow_graph();
 

From b836cf4e8a6ce66c83b428f6d775ad1ded2e6912 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 12:16:55 -0700
Subject: [PATCH 108/113] Document parameters of the init-return task launchers.

---
 .../realm-execution/tasks/impl/ff_handle_init_return_task.h  | 5 +++++
 .../tasks/impl/per_device_op_state_init_return_task.h        | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index 20942a14f7..c8be641998 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -22,6 +22,11 @@ void ff_handle_init_return_task_body(
  * \brief Launches the task (\ref ff_handle_init_return_task_body) for returning
  * the asynchronously-initialized \ref PerDeviceFFHandle.
  *
+ * \param origin_proc The processor to send the result to.
+ * \param result The result value.
+ * \param origin_result_ptr The pointer, on the origin processor, to which the
+ * result should be written.
+ *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
  */
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
index 46a4bab727..4a0739550c 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
@@ -23,6 +23,11 @@ void per_device_op_state_init_return_task_body(
  * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for
  * returning the asynchronously-initialized \ref PerDeviceOpState.
  *
+ * \param origin_proc The processor to send the result to.
+ * \param result The result value.
+ * \param origin_result_ptr The pointer, on the origin processor, to which the
+ * result should be written.
+ *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
  */

From 931b116642aa204b37ca5a0b6d6151ed8edb59be Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 12:18:32 -0700
Subject: [PATCH 109/113] Format deref lambda in per-device op state initialization.

---
 .../distributed_per_device_op_state_initialization.cc         | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
index 1d02c73ea9..1d517a8fe4 100644
--- a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
+++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc
@@ -65,9 +65,7 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
 
   ctx.get_outstanding_events().wait();
 
-  auto deref = [](DeviceSpecificPtr<PerDeviceOpState> *const &p) {
-    return *p;
-  };
+  auto deref = [](DeviceSpecificPtr<PerDeviceOpState> *const &p) { return *p; };
   std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
       result = map_values(device_state_map, deref);
 

From 015ce369f2d38c29675a961948073db7a7f9d151 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Mar 2026 13:35:18 -0700
Subject: [PATCH 110/113] Fix documentation tests.

---
 .../realm-execution/tasks/impl/ff_handle_init_return_task.h     | 2 ++
 .../tasks/impl/per_device_op_state_init_return_task.h           | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
index c8be641998..8c5a6802f7 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h
@@ -22,10 +22,12 @@ void ff_handle_init_return_task_body(
  * \brief Launches the task (\ref ff_handle_init_return_task_body) for returning
  * the asynchronously-initialized \ref PerDeviceFFHandle.
  *
+ * \param ctx The Realm context.
  * \param origin_proc The processor to send the result to.
  * \param result The result value.
  * \param origin_result_ptr The pointer, on the origin processor, to which the
  * result should be written.
+ * \param precondition Event precondition to the resulting task.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
index 4a0739550c..82a9cdd168 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h
@@ -23,10 +23,12 @@ void per_device_op_state_init_return_task_body(
  * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for
  * returning the asynchronously-initialized \ref PerDeviceOpState.
  *
+ * \param ctx The Realm context.
  * \param origin_proc The processor to send the result to.
  * \param result The result value.
  * \param origin_result_ptr The pointer, on the origin processor, to which the
  * result should be written.
+ * \param precondition Event precondition to the resulting task.
  *
  * To understand how this fits into the broader structure of \ref
  * realm-execution, see \ref realm-execution-tasks.

From 50284921dd45847a2cefd509fa99a2cd28797b70 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Tue, 17 Mar 2026 15:45:03 -0700
Subject: [PATCH 111/113] Add some docs based on Elliott's PR comments

---
 flake.lock                                          |  8 ++++----
 .../per_device_op_state_init_task_args.dtg.toml     |  7 +++++++
 .../tensor_instance_backing.dtg.toml                | 13 +++++++++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/flake.lock b/flake.lock
index 359fdb19a9..ca71a446a9 100644
--- a/flake.lock
+++ b/flake.lock
@@ -66,11 +66,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1773224815,
-        "narHash": "sha256-A7JWZNzYwzMZigyqm8IzyiBu82iFznp+oZJzx0eZjmU=",
+        "lastModified": 1773786960,
+        "narHash": "sha256-XGta5Z2idBD9bAvdmx+6kN0GQpNruwNYq1BSONH1Sgo=",
         "ref": "refs/heads/master",
-        "rev": "d1db2bac548f66912d22023a3cece241ded1f503",
-        "revCount": 145,
+        "rev": "da1097f7ef7ecc659a2ed740203c1be8262de7fa",
+        "revCount": 147,
         "type": "git",
         "url": "https://git.sr.ht/~lockshaw/proj"
       },
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
index 6711050f1a..98bbdb6a7b 100644
--- a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml
@@ -2,6 +2,13 @@ namespace = "FlexFlow"
 name = "PerDeviceOpStateInitTaskArgs"
 type = "struct"
 features = []
+docstring = '''
+\brief An encapsulation of the arguments passed to \ref per_device_op_state_init_return_task_body
+       by \ref spawn_per_device_op_state_init_return_task.
+
+For the meaning of specific fields, see the parameter descriptions for
+\ref spawn_per_device_op_state_init_return_task.
+'''
 
 includes = [
   "kernels/profiling_settings.dtg.h",
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
index 1105af4a92..051edb0b9f 100644
--- a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -29,3 +29,16 @@ src_includes = [
 [[fields]]
 name = "backing"
 type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, std::pair<::FlexFlow::Realm::RegionInstance, ::FlexFlow::Realm::Event>>"
+docstring = '''
+The need to track a pair of RegionInstance and Event, rather than just a RegionInstance, is due to the fact that
+unlike in Legion, RegionInstance does not natively encode its own ready state. This gives you a choice:
+
+1. Either block eagerly (to make sure all instances are ready and you never accidentally use an instance before it becomes ready), or
+2. Carry the ready event around with the instance and block before you use it.
+
+The way Realm works, an instance always has exactly one ready event (i.e., a
+ready event is actually a unique key for the instance), so there's really only
+one semantic meaning that makes sense here. Thus, it's idiomatic Realm coding style
+to talk about any given asynchronous object handle as a pair of (Object handle,
+ready Event for Object).
+'''

From 4a8a1d20682810e8eef13ea3bd50706a735c98ea Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Tue, 17 Mar 2026 18:35:59 -0700
Subject: [PATCH 112/113] PR comment fixes

---
 cmake/realm.cmake                             |  4 +++
 ...apped_runtime_only_op_cost_estimate_key.cc |  2 +-
 .../unity_algorithm/graph_optimize_state.cc   |  2 +-
 .../test/src/local-execution/test_e2e.cc      |  4 +--
 lib/pcg/include/pcg/computation_graph.h       |  5 ++-
 .../parallel_computation_graph.h              | 10 ++----
 lib/pcg/src/pcg/computation_graph.cc          | 22 ++----------
 .../parallel_computation_graph.cc             | 36 ++-----------------
 .../parallel_computation_graph_builder.cc     | 24 ++++++-------
 lib/realm-execution/CMakeLists.txt            |  2 +-
 .../realm-execution/device_specific_ptr.h     | 21 +++++++++++
 .../test/src/realm-execution/test_e2e.cc      |  4 +--
 .../sub_parallel_computation_graph.h          |  4 +--
 .../output_expr_to_result_sub_pcg_mapping.cc  |  2 +-
 .../src/substitutions/pcg_pattern_match.cc    |  2 +-
 .../sub_parallel_computation_graph.cc         |  4 +--
 .../evaluate_substitution_output.cc           |  4 +--
 17 files changed, 62 insertions(+), 90 deletions(-)

diff --git a/cmake/realm.cmake b/cmake/realm.cmake
index 2d915f1211..91d8f1345c 100644
--- a/cmake/realm.cmake
+++ b/cmake/realm.cmake
@@ -1 +1,5 @@
+include(aliasing)
+
 find_package(Realm REQUIRED)
+
+alias_library(realm Realm::Realm)
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc
index bf8d9fb70b..e3361fa356 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc
@@ -24,7 +24,7 @@ UnmappedRuntimeOnlyOpCostEstimateKey
       map_values(get_incoming_weights(pcg, parallel_layer_guid),
                  get_tensor_shape),
       /*output_shapes=*/
-      map_values(get_layer_outputs(pcg, parallel_layer_guid), get_tensor_shape),
+      map_values(get_outgoing_tensors(pcg, parallel_layer_guid), get_tensor_shape),
   };
 }
 
diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
index 6187ce7b60..7e7a80018e 100644
--- a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -53,7 +53,7 @@ static std::unordered_multiset<std::tuple<
             });
 
     std::unordered_map<TensorSlotName, ParallelTensorAttrs> outputs =
-        map_values(get_layer_outputs(pcg, l),
+        map_values(get_outgoing_tensors(pcg, l),
                    [&](parallel_tensor_guid_t const &o) {
                      return get_parallel_tensor_attrs(pcg, o);
                    });
diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc b/lib/local-execution/test/src/local-execution/test_e2e.cc
index 15a8a4138f..da62d22071 100644
--- a/lib/local-execution/test/src/local-execution/test_e2e.cc
+++ b/lib/local-execution/test/src/local-execution/test_e2e.cc
@@ -61,7 +61,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
 
     LayerAddedResult inputs_layer =
-        add_input_layer_with_grad(computation_graph, input_tensor_shape);
+        add_input_layer(computation_graph, input_tensor_shape);
     tensor_guid_t t_input =
         require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
 
@@ -228,7 +228,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT};
 
     LayerAddedResult inputs_layer =
-        add_input_layer_with_grad(computation_graph, input_tensor_shape);
+        add_input_layer(computation_graph, input_tensor_shape);
     tensor_guid_t t_input =
         require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
 
diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h
index cd42328fd1..8dfb2eedb4 100644
--- a/lib/pcg/include/pcg/computation_graph.h
+++ b/lib/pcg/include/pcg/computation_graph.h
@@ -25,9 +25,8 @@ LayerAddedResult add_layer(
         &outputs = std::nullopt);
 
 LayerAddedResult add_input_layer(ComputationGraph &computation_graph,
-                                 TensorShape const &tensor_shape);
-LayerAddedResult add_input_layer_with_grad(ComputationGraph &computation_graph,
-                                           TensorShape const &tensor_shape);
+                                 TensorShape const &tensor_shape,
+                                 CreateGrad create_grad = CreateGrad::NO);
 
 TensorAttrs get_tensor_attrs(ComputationGraph const &, tensor_guid_t const &);
 bool are_tensor_guid_shapes_equivalent(ComputationGraph const &cg,
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 21f33f6d3d..4c7453f850 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -30,11 +30,8 @@ ParallelLayerAddedResult add_parallel_layer(
         &outputs = std::nullopt);
 
 ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
-                                             TensorShape const &tensor_shape);
-
-ParallelLayerAddedResult
-    pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg,
-                                  TensorShape const &tensor_shape);
+                                             TensorShape const &tensor_shape,
+                                             CreateGrad create_grad = CreateGrad::NO);
 
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer);
@@ -64,9 +61,6 @@ std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &,
                          parallel_layer_guid_t const &);
-std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
-    get_layer_outputs(ParallelComputationGraph const &,
-                      parallel_layer_guid_t const &);
 
 std::unordered_map<TensorSlotName, OperatorSpaceToParallelTensorSpaceMapping>
     pcg_get_operator_to_incoming_mappings(ParallelComputationGraph const &,
diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc
index a78f179e66..4eac3d1cfa 100644
--- a/lib/pcg/src/pcg/computation_graph.cc
+++ b/lib/pcg/src/pcg/computation_graph.cc
@@ -114,7 +114,8 @@ LayerAddedResult add_layer(
 }
 
 LayerAddedResult add_input_layer(ComputationGraph &cg,
-                                 TensorShape const &tensor_shape) {
+                                 TensorShape const &tensor_shape,
+                                 CreateGrad create_grad) {
   LayerAttrs layer_attrs = LayerAttrs{
       /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}},
       /*name=*/std::nullopt,
@@ -126,24 +127,7 @@ LayerAddedResult add_input_layer(ComputationGraph &cg,
                    /*weights=*/{},
                    /*outputs=*/
                    std::unordered_map<TensorSlotName, CreateGrad>{
-                       {TensorSlotName::OUTPUT, CreateGrad::NO},
-                   });
-}
-
-LayerAddedResult add_input_layer_with_grad(ComputationGraph &cg,
-                                           TensorShape const &tensor_shape) {
-  LayerAttrs layer_attrs = LayerAttrs{
-      /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}},
-      /*name=*/std::nullopt,
-  };
-
-  return add_layer(cg,
-                   layer_attrs,
-                   /*inputs=*/{},
-                   /*weights=*/{},
-                   /*outputs=*/
-                   std::unordered_map<TensorSlotName, CreateGrad>{
-                       {TensorSlotName::OUTPUT, CreateGrad::YES},
+                       {TensorSlotName::OUTPUT, create_grad},
                    });
 }
 
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index 959747dbc7..a7d61d0644 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -123,7 +123,8 @@ ParallelLayerAddedResult add_parallel_layer(
 }
 
 ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
-                                             TensorShape const &tensor_shape) {
+                                             TensorShape const &tensor_shape,
+                                             CreateGrad create_grad) {
   ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{
       /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}},
       /*name=*/std::nullopt,
@@ -137,28 +138,7 @@ ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
                             std::unordered_map<TensorSlotName, CreateGrad>{
                                 {
                                     TensorSlotName::OUTPUT,
-                                    CreateGrad::NO,
-                                },
-                            });
-}
-
-ParallelLayerAddedResult
-    pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg,
-                                  TensorShape const &tensor_shape) {
-  ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{
-      /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}},
-      /*name=*/std::nullopt,
-  };
-
-  return add_parallel_layer(/*pcg=*/pcg,
-                            /*layer_attrs=*/layer_attrs,
-                            /*inputs=*/{},
-                            /*weights=*/{},
-                            /*output_flags=*/
-                            std::unordered_map<TensorSlotName, CreateGrad>{
-                                {
-                                    TensorSlotName::OUTPUT,
-                                    CreateGrad::YES,
+                                    create_grad,
                                 },
                             });
 }
@@ -253,16 +233,6 @@ std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
                     });
 }
 
-std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
-    get_layer_outputs(ParallelComputationGraph const &pcg,
-                      parallel_layer_guid_t const &l) {
-  return map_values(get_outgoing_kwarg_dataflow_outputs_for_node(
-                        pcg.raw_graph, l.raw_graph_node),
-                    [](KwargDataflowOutput<TensorSlotName> const &o) {
-                      return parallel_tensor_guid_t{o};
-                    });
-}
-
 std::unordered_map<TensorSlotName, OperatorSpaceToParallelTensorSpaceMapping>
     pcg_get_operator_to_incoming_mappings(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &l) {
diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc
index aae34c8080..fd314ebaea 100644
--- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc
+++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc
@@ -69,7 +69,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -145,7 +145,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -200,7 +200,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -329,7 +329,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     CHECK(conv_bias_shape == correct_bias_shape);
 
     std::unordered_map<TensorSlotName, parallel_tensor_guid_t> conv_outputs =
-        get_layer_outputs(b.pcg, conv_guid);
+        get_outgoing_tensors(b.pcg, conv_guid);
     CHECK(conv_outputs.size() == 1);
 
     parallel_tensor_guid_t conv_output =
@@ -371,7 +371,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -413,7 +413,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -464,7 +464,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -508,7 +508,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -556,7 +556,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -602,7 +602,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -646,7 +646,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
@@ -695,7 +695,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("outputs") {
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> result =
-          get_layer_outputs(b.pcg, layer);
+          get_outgoing_tensors(b.pcg, layer);
       std::unordered_map<TensorSlotName, parallel_tensor_guid_t> correct = {
           {
               TensorSlotName::OUTPUT,
diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt
index 08676525e1..25a51ada54 100644
--- a/lib/realm-execution/CMakeLists.txt
+++ b/lib/realm-execution/CMakeLists.txt
@@ -16,7 +16,7 @@ ff_add_library(
     spdlog
     task-spec
     utils
-    Realm::Realm
+    realm
 )
 
 add_subdirectory(test)
diff --git a/lib/realm-execution/include/realm-execution/device_specific_ptr.h b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
index 590b7dbc74..59bc1d0465 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_ptr.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
@@ -6,6 +6,27 @@
 
 namespace FlexFlow {
 
+/**
+ * \brief Holds a pointer into (potentially) remote memory which checks that
+ * the memory is local on access.
+ *
+ * There exist per-device states (i.e., \ref PerDeviceOpState and \ref
+ * FFHandle) that need to be created and managed by the central \ref
+ * term-controller "controller task". Since these are opaque pointers they
+ * can't be safely copied to and from the remote devices, so we instead
+ * transfer the pointers back-and-forth between workers and the controller
+ * task. To prevent accidentally accessing one of these pointers on the wrong
+ * device (as the pointer is only valid in the memory where it was created), we
+ * wrap them with \ref DeviceSpecificPtr, which holds the \ref device_idx_t
+ * where the pointer was created, and any attempt to interact with the raw
+ * pointer value (i.e., \ref DeviceSpecificPtr::get) checks that the current
+ * device matches the original device, and throws a readable error message if
+ * it does not.
+ *
+ * \note \ref DeviceSpecificPtr explicitly does not own the pointer that it holds, leaving
+ * lifetime management up to the user of the pointer. If you want a lifetime-managed version,
+ * see \ref DeviceSpecific.
+ */
 template <typename T>
 struct DeviceSpecificPtr {
 public:
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
index 626c7c316a..4a8edb3b6c 100644
--- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -75,7 +75,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
 
       ParallelLayerAddedResult inputs_layer =
-          pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+          pcg_add_input_layer(pcg, input_tensor_shape);
       parallel_tensor_guid_t t_input =
           require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
 
@@ -288,7 +288,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
 
     ParallelLayerAddedResult inputs_layer =
-        pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+        pcg_add_input_layer(pcg, input_tensor_shape);
     parallel_tensor_guid_t t_input =
         require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
 
diff --git a/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h b/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h
index 0b4ac7238c..cbfe3ab264 100644
--- a/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h
+++ b/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h
@@ -37,8 +37,8 @@ std::unordered_map<TensorSlotName, open_parallel_tensor_guid_t>
     get_layer_inputs(SubParallelComputationGraph const &,
                      parallel_layer_guid_t const &);
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
-    get_layer_outputs(SubParallelComputationGraph const &,
-                      parallel_layer_guid_t const &);
+    get_outgoing_tensors(SubParallelComputationGraph const &,
+                         parallel_layer_guid_t const &);
 
 std::unordered_set<SubParallelComputationGraphEdge> get_subgraph_incoming_edges(
     SubParallelComputationGraph const &,
diff --git a/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc b/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc
index d263bb842a..2ad5b54a17 100644
--- a/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc
+++ b/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc
@@ -17,7 +17,7 @@ bidict<parallel_tensor_guid_t, OutputGraphExprNodeOutput>
 
   for (auto const &[parallel_layer, output_graph_expr_node] : m.node_mapping) {
     std::unordered_map<TensorSlotName, parallel_tensor_guid_t> layer_outputs =
-        get_layer_outputs(spcg, parallel_layer);
+        get_outgoing_tensors(spcg, parallel_layer);
     std::unordered_map<TensorSlotName, OutputGraphExprNodeOutput>
         output_graph_expr_outputs =
             get_node_outputs(output_graph_expr, output_graph_expr_node);
diff --git a/lib/substitutions/src/substitutions/pcg_pattern_match.cc b/lib/substitutions/src/substitutions/pcg_pattern_match.cc
index 0596764747..498fd6c1bf 100644
--- a/lib/substitutions/src/substitutions/pcg_pattern_match.cc
+++ b/lib/substitutions/src/substitutions/pcg_pattern_match.cc
@@ -24,7 +24,7 @@ bidict<PatternNodeOutput, parallel_tensor_guid_t>
   for (auto const &[pattern_node, matched_layer] : match.node_assignment) {
     bidict<TensorSlotName, parallel_tensor_guid_t>
         matched_layer_output_tensors =
-            bidict_from_map(get_layer_outputs(spcg, matched_layer));
+            bidict_from_map(get_outgoing_tensors(spcg, matched_layer));
     bidict<TensorSlotName, PatternNodeOutput> pattern_node_outputs =
         bidict_from_map(get_pattern_node_outputs(pattern, pattern_node));
 
diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc
index 12074bff33..34b8ae1e96 100644
--- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc
+++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc
@@ -91,8 +91,8 @@ std::unordered_map<TensorSlotName, open_parallel_tensor_guid_t>
 }
 
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
-    get_layer_outputs(SubParallelComputationGraph const &pcg,
-                      parallel_layer_guid_t const &layer) {
+    get_outgoing_tensors(SubParallelComputationGraph const &pcg,
+                         parallel_layer_guid_t const &layer) {
   return map_values(get_outgoing_kwarg_dataflow_outputs_for_node(
                         pcg.raw_graph, layer.raw_graph_node),
                     [](KwargDataflowOutput<TensorSlotName> const &o) {
diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc
index efebedb5df..cf70b0cf53 100644
--- a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc
+++ b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc
@@ -270,13 +270,13 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_parallel_tensor_attrs(
               pcg,
               open_parallel_tensor_guid_from_closed(
-                  require_only_key(get_layer_outputs(pcg, relu_match_layer),
+                  require_only_key(get_outgoing_tensors(pcg, relu_match_layer),
                                    TensorSlotName::OUTPUT)));
 
       parallel_layer_guid_t result_fused_mm_relu_node =
           result_node_map.at_r(fused_mm_relu_node);
       parallel_tensor_guid_t result_fused_mm_relu_output = require_only_key(
-          get_layer_outputs(result_graph, result_fused_mm_relu_node),
+          get_outgoing_tensors(result_graph, result_fused_mm_relu_node),
           TensorSlotName::OUTPUT);
       input_parallel_tensor_guid_t result_i_activation =
           result_input_map.at_r(output_i_activation);

From ab3da7cd3eed4c16ae20625aebb6a733d7da02a5 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Tue, 17 Mar 2026 18:40:19 -0700
Subject: [PATCH 113/113] Format and doxygen fixes

---
 .../unmapped_runtime_only_op_cost_estimate_key.cc          | 3 ++-
 .../parallel_computation_graph.h                           | 7 ++++---
 .../include/realm-execution/device_specific_ptr.h          | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc
index e3361fa356..957d842493 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc
@@ -24,7 +24,8 @@ UnmappedRuntimeOnlyOpCostEstimateKey
       map_values(get_incoming_weights(pcg, parallel_layer_guid),
                  get_tensor_shape),
       /*output_shapes=*/
-      map_values(get_outgoing_tensors(pcg, parallel_layer_guid), get_tensor_shape),
+      map_values(get_outgoing_tensors(pcg, parallel_layer_guid),
+                 get_tensor_shape),
   };
 }
 
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 4c7453f850..917200af68 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -29,9 +29,10 @@ ParallelLayerAddedResult add_parallel_layer(
     std::optional<std::unordered_map<TensorSlotName, CreateGrad>> const
         &outputs = std::nullopt);
 
-ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
-                                             TensorShape const &tensor_shape,
-                                             CreateGrad create_grad = CreateGrad::NO);
+ParallelLayerAddedResult
+    pcg_add_input_layer(ParallelComputationGraph &pcg,
+                        TensorShape const &tensor_shape,
+                        CreateGrad create_grad = CreateGrad::NO);
 
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer);
diff --git a/lib/realm-execution/include/realm-execution/device_specific_ptr.h b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
index 59bc1d0465..32f8c13272 100644
--- a/lib/realm-execution/include/realm-execution/device_specific_ptr.h
+++ b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
@@ -11,13 +11,13 @@ namespace FlexFlow {
  * the memory is local on access.
  *
  * There exist per-device states (i.e., \ref PerDeviceOpState and \ref
- * FFHandle) that need to be created and managed by the central \ref
+ * PerDeviceFFHandle) that need to be created and managed by the central \ref
  * term-controller "controller task". Since these are opaque pointers they
  * can't be safely copied to and from the remote devices, so we instead
  * transfer the pointers back-and-forth between workers and the controller
  * task. To prevent accidentally accessing one of these pointers on the wrong
  * device (as the pointer is only valid in the memory where it was created), we
- * wrap them with \ref DeviceSpecificPtr, which holds the \ref device_idx_t
+ * wrap them with \ref DeviceSpecificPtr, which holds the \ref device_id_t
  * where the pointer was created, and any attempt to interact with the raw
  * pointer value (i.e., \ref DeviceSpecificPtr::get) checks that the current
  * device matches the original device, and throws a readable error message if