From b6d00f897cd8f52165596debbcf274c21f4aa3b4 Mon Sep 17 00:00:00 2001 From: Seema Mirchandaney Date: Thu, 9 Apr 2026 15:49:29 -0700 Subject: [PATCH 1/6] Add support for replicate op in distributed training - Add perform_pass_expansion_for_replicate for fwd/bwd pass expansion - Add perform_shard_expansion_for_replicate and _bwd for shard expansion - Add build_replicate_invocation in make_dynamic_open_dataflow_graph - Add is_replicate_attrs helper and guard replicate in copy_insertion - Add ReplicateAttrs to TrainingOperationAttrs - Add SumReductionFloat/Double for backward replicate reduce operation - Add issue_replicate_bwd in spawn_dynamic_node_invocation - Fix per_device_op_state init race condition with direct write - Fix .value() calls on optional per_device_op_state across op impls - Update issue_copy to support optional reduction op - Add testcase for replicate op --- .../include/realm-execution/sum_reduction.h | 99 ++++ .../realm-execution/tasks/realm_reduction.h | 96 ++++ .../src/realm-execution/test_op_replicate.cc | 450 ++++++++++++++++++ 3 files changed, 645 insertions(+) create mode 100644 lib/realm-execution/include/realm-execution/sum_reduction.h create mode 100644 lib/realm-execution/include/realm-execution/tasks/realm_reduction.h create mode 100644 lib/realm-execution/test/src/realm-execution/test_op_replicate.cc diff --git a/lib/realm-execution/include/realm-execution/sum_reduction.h b/lib/realm-execution/include/realm-execution/sum_reduction.h new file mode 100644 index 0000000000..b845b5b7f2 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/sum_reduction.h @@ -0,0 +1,99 @@ +#pragma once +#include +#include "op-attrs/datatype.dtg.h" + +namespace FlexFlow { + +// Sum reduction for float +struct SumReductionFloat { + using LHS = float; + using RHS = float; + static const RHS identity; + + template + static void apply(LHS &lhs, RHS rhs) { + if (EXCLUSIVE) { + lhs += rhs; + } else { + // atomic add for non-exclusive + 
__sync_fetch_and_add((int*)&lhs, *(int*)&rhs); + // proper float atomic add — use union trick + union { float f; int i; } old_val, new_val; + do { + old_val.f = lhs; + new_val.f = old_val.f + rhs; + } while (!__sync_bool_compare_and_swap( + (int*)&lhs, old_val.i, new_val.i)); + } + } + + template + static void fold(RHS &rhs1, RHS rhs2) { + if (EXCLUSIVE) { + rhs1 += rhs2; + } else { + union { float f; int i; } old_val, new_val; + do { + old_val.f = rhs1; + new_val.f = old_val.f + rhs2; + } while (!__sync_bool_compare_and_swap( + (int*)&rhs1, old_val.i, new_val.i)); + } + } +}; + +const SumReductionFloat::RHS SumReductionFloat::identity = 0.0f; + +// Sum reduction for double +struct SumReductionDouble { + using LHS = double; + using RHS = double; + static const RHS identity; + + template + static void apply(LHS &lhs, RHS rhs) { + if (EXCLUSIVE) { + lhs += rhs; + } else { + union { double d; long long i; } old_val, new_val; + do { + old_val.d = lhs; + new_val.d = old_val.d + rhs; + } while (!__sync_bool_compare_and_swap( + (long long*)&lhs, old_val.i, new_val.i)); + } + } + + template + static void fold(RHS &rhs1, RHS rhs2) { + if (EXCLUSIVE) { + rhs1 += rhs2; + } else { + union { double d; long long i; } old_val, new_val; + do { + old_val.d = rhs1; + new_val.d = old_val.d + rhs2; + } while (!__sync_bool_compare_and_swap( + (long long*)&rhs1, old_val.i, new_val.i)); + } + } +}; + +const SumReductionDouble::RHS SumReductionDouble::identity = 0.0; + +// Reduction op IDs — must not conflict with other registered redops +enum SumReductionOpIDs { + REDOP_SUM_FLOAT = 1, + REDOP_SUM_DOUBLE = 2, +}; + +inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { + switch (dtype) { + case DataType::FLOAT: return REDOP_SUM_FLOAT; + case DataType::DOUBLE: return REDOP_SUM_DOUBLE; + default: + PANIC("no sum reduction registered for datatype {}", dtype); + } +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h 
b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h new file mode 100644 index 0000000000..d1d6e1d880 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h @@ -0,0 +1,96 @@ +#pragma once +#include +#include "op-attrs/datatype.dtg.h" + +namespace FlexFlow { + +// Sum reduction for float +struct SumReductionFloat { + using LHS = float; + using RHS = float; + static constexpr RHS identity = 0.0f; // ← inside struct, constexpr + + template + static void apply(LHS &lhs, RHS rhs) { + if (EXCLUSIVE) { + lhs += rhs; + } else { + // atomic add for non-exclusive + __sync_fetch_and_add((int*)&lhs, *(int*)&rhs); + // proper float atomic add — use union trick + union { float f; int i; } old_val, new_val; + do { + old_val.f = lhs; + new_val.f = old_val.f + rhs; + } while (!__sync_bool_compare_and_swap( + (int*)&lhs, old_val.i, new_val.i)); + } + } + + template + static void fold(RHS &rhs1, RHS rhs2) { + if (EXCLUSIVE) { + rhs1 += rhs2; + } else { + union { float f; int i; } old_val, new_val; + do { + old_val.f = rhs1; + new_val.f = old_val.f + rhs2; + } while (!__sync_bool_compare_and_swap( + (int*)&rhs1, old_val.i, new_val.i)); + } + } +}; + + +// Sum reduction for double +struct SumReductionDouble { + using LHS = double; + using RHS = double; + static constexpr RHS identity = 0.0; // ← inside struct, constexpr + + template + static void apply(LHS &lhs, RHS rhs) { + if (EXCLUSIVE) { + lhs += rhs; + } else { + union { double d; long long i; } old_val, new_val; + do { + old_val.d = lhs; + new_val.d = old_val.d + rhs; + } while (!__sync_bool_compare_and_swap( + (long long*)&lhs, old_val.i, new_val.i)); + } + } + + template + static void fold(RHS &rhs1, RHS rhs2) { + if (EXCLUSIVE) { + rhs1 += rhs2; + } else { + union { double d; long long i; } old_val, new_val; + do { + old_val.d = rhs1; + new_val.d = old_val.d + rhs2; + } while (!__sync_bool_compare_and_swap( + (long long*)&rhs1, old_val.i, new_val.i)); + } + } +}; + +// 
Reduction op IDs — must not conflict with other registered redops +enum SumReductionOpIDs { + REDOP_SUM_FLOAT = 1, + REDOP_SUM_DOUBLE = 2, +}; + +inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { + switch (dtype) { + case DataType::FLOAT: return REDOP_SUM_FLOAT; + case DataType::DOUBLE: return REDOP_SUM_DOUBLE; + default: + PANIC("no sum reduction registered for datatype {}", dtype); + } +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc b/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc new file mode 100644 index 0000000000..d1fc941007 --- /dev/null +++ b/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc @@ -0,0 +1,450 @@ +#include "internal/realm_test_utils.h" +#include "kernels/allocation.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "op-attrs/operator_task_space_to_operator_task_space_mapping.h" +#include "op-attrs/ops/element_unary.h" +#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include "realm-execution/pcg_instance.h" +#include "realm-execution/realm_context.h" +#include 
"realm-execution/realm_manager.h" +#include "task-spec/permissions.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/require_only_key.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +template +static ParallelLayerAttrs make_layer_attrs(T const &op_attrs) { + return ParallelLayerAttrs{ + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, + }; +}; + +static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch, + Allocator &allocator) { + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, allocator)); +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training Replicate Op (CPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + ControllerTaskResult result = manager.start_controller([](RealmContext + &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + // 10,2 + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + // 10,2 + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + + // construct computation graph + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input tensor + // 10, 16 + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + // parallel layer -> input tensor + ParallelLayerAddedResult inputs_layer = + 
pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> input tensor 2 + ParallelLayerAddedResult inputs_layer_2 = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input_2 = + require_only_key(inputs_layer_2.outputs, TensorSlotName::OUTPUT); + + // binary ADD attribute + ElementBinaryAttrs add_attrs = ElementBinaryAttrs{ + OperatorType::EW_ADD, + DataType::FLOAT, + false, + false, + }; + + // parallel layer -> perform add + ParallelLayerAddedResult add_operator_1 = + add_parallel_layer(pcg, make_layer_attrs(add_attrs), + { + { + TensorSlotName::LHS_INPUT, + t_input, + }, + { + TensorSlotName::RHS_INPUT, + t_input_2, + }, + }, + {/* weight */}); + + parallel_tensor_guid_t t_add_1 = + require_only_key(add_operator_1.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> perform replicate + const positive_int replicate_degree = 2_p; + ReplicateAttrs repl_attrs = ReplicateAttrs(replicate_degree); + ParallelLayerAddedResult repl_operator_1 = + add_parallel_layer(pcg, make_layer_attrs(repl_attrs), + { + { + TensorSlotName::INPUT, + t_add_1, + }, + }, + /*weight=*/{}); + // output of replicate layer + parallel_tensor_guid_t t_repl_1 = + require_only_key(repl_operator_1.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> perform RelU + ParallelLayerAddedResult relu_operator_1 = + add_parallel_layer(pcg, make_layer_attrs(make_relu_attrs()), + /*inputs=*/ + { + { + TensorSlotName::INPUT, + t_repl_1, + }, + }, + /*weights=*/{}); + // output of relu layer + parallel_tensor_guid_t t_relu_1 = + require_only_key(relu_operator_1.outputs, TensorSlotName::OUTPUT); + + // machine + MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; + MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; + + ParallelTensorSpaceCoordinate tensor_coord0{ + /* sum_component */ 0_n, /* discard_copy_component */ 0_n, + /*shard_component*/ 
FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord1{ + /* sum_component */ 0_n, /* discard_copy_component */ 1_n, + /*shard_component*/ FFOrdered{0_n}}; + MappedParallelComputationGraph mpcg{ + pcg, + {{inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, OperatorAtomicTaskShardBinding{{{TensorSlotName::OUTPUT, + tensor_coord0}}}}}}}, + {inputs_layer_2.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, OperatorAtomicTaskShardBinding{{{TensorSlotName::OUTPUT, + tensor_coord0}}}}}}}, + {add_operator_1.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::LHS_INPUT, tensor_coord0}, + {TensorSlotName::RHS_INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + {repl_operator_1.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {cpu1, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}, + }}}, + {relu_operator_1.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {cpu1, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord1}, + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}, + }}}}, + }; + + MappedOperatorTaskGroup loss_mapping{ + {{cpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::LOGIT, tensor_coord0}, + }}}}}; + + // instantiate computation graph + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = + create_distributed_ff_handle(ctx, + /*workSpaceSize=*/1024 * 1024, + 
/*allowTensorOpMathConversion=*/true); + PCGInstance pcg_instance = create_pcg_instance( + /*ctx=*/ctx, + /*mpcg=*/mpcg, + /*optimizer=*/optimizer_attrs, + /*loss=*/std::nullopt, + /*input_tensors=*/input_tensors, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + + // begin training loop + int num_epochs = 1; + for (int i = 0; i < num_epochs; i++) { + perform_all_passes_for_pcg_instance( + /*instance=*/pcg_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + } + }); + result.wait(); + } +} + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training Replicate Op (GPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/2_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + + ControllerTaskResult result = + manager.start_controller([](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + // 10,2 + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + // 10,2 + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + + // construct computation graph + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input tensor + // 10, 16 + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + // parallel layer -> input tensor + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, 
input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> input tensor 2 + ParallelLayerAddedResult inputs_layer_2 = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input_2 = + require_only_key(inputs_layer_2.outputs, TensorSlotName::OUTPUT); + + // binary ADD attribute + ElementBinaryAttrs add_attrs = ElementBinaryAttrs{ + OperatorType::EW_ADD, + DataType::FLOAT, + false, + false, + }; + + // parallel layer -> perform add + ParallelLayerAddedResult add_operator_1 = + add_parallel_layer(pcg, make_layer_attrs(add_attrs), + { + { + TensorSlotName::LHS_INPUT, + t_input, + }, + { + TensorSlotName::RHS_INPUT, + t_input_2, + }, + }, + {/* weight */}); + + parallel_tensor_guid_t t_add_1 = + require_only_key(add_operator_1.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> perform replicate + const positive_int replicate_degree = 2_p; + ReplicateAttrs repl_attrs = ReplicateAttrs(replicate_degree); + ParallelLayerAddedResult repl_operator_1 = + add_parallel_layer(pcg, make_layer_attrs(repl_attrs), + { + { + TensorSlotName::INPUT, + t_add_1, + }, + }, + /*weight=*/{}); + // output of replicate layer + parallel_tensor_guid_t t_repl_1 = + require_only_key(repl_operator_1.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> perform RelU + ParallelLayerAddedResult relu_operator_1 = + add_parallel_layer(pcg, make_layer_attrs(make_relu_attrs()), + /*inputs=*/ + { + { + TensorSlotName::INPUT, + t_repl_1, + }, + }, + /*weights=*/{}); + // output of relu layer + parallel_tensor_guid_t t_relu_1 = + require_only_key(relu_operator_1.outputs, TensorSlotName::OUTPUT); + + // machine + MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU}; + MachineSpaceCoordinate gpu1{0_n, 1_n, DeviceType::GPU}; + ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord1{0_n, 1_n, FFOrdered{0_n}}; + 
MappedParallelComputationGraph mpcg{ + pcg, + { + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {inputs_layer_2.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {add_operator_1.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::LHS_INPUT, tensor_coord0}, + {TensorSlotName::RHS_INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + {repl_operator_1.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {gpu1, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}}}}, + {relu_operator_1.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {gpu1, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord1}, + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}, + }}}, + }, + }; + + MappedOperatorTaskGroup loss_mapping{ + {{gpu0, OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::LOGIT, tensor_coord0}, + }}}}}; + + // instantiate computation graph + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = create_distributed_ff_handle( + ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = create_pcg_instance( + /*ctx=*/ctx, + /*mpcg=*/mpcg, + /*optimizer=*/optimizer_attrs, + 
/*loss=*/std::nullopt, + /*input_tensors=*/input_tensors, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + + // begin training loop + int num_epochs = 1; + for (int i = 0; i < num_epochs; i++) { + perform_all_passes_for_pcg_instance( + /*instance=*/pcg_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + } + }); + result.wait(); + } +} +} // namespace test From 34056217cbb4a8067e582a792fa8af726c8d712e Mon Sep 17 00:00:00 2001 From: Seema Mirchandaney Date: Thu, 9 Apr 2026 15:52:21 -0700 Subject: [PATCH 2/6] Add support for replicate op in distributed training - Add perform_pass_expansion_for_replicate for fwd/bwd pass expansion - Add perform_shard_expansion_for_replicate and _bwd for shard expansion - Add build_replicate_invocation in make_dynamic_open_dataflow_graph - Add is_replicate_attrs helper and guard replicate in copy_insertion - Add ReplicateAttrs to TrainingOperationAttrs - Add SumReductionFloat/Double for backward replicate reduce operation - Add issue_replicate_bwd in spawn_dynamic_node_invocation - Fix per_device_op_state init race condition with direct write - Fix .value() calls on optional per_device_op_state across op impls - Update issue_copy to support optional reduction op - Add testcase for replicate op --- .../src/op-attrs/ops/element_unary.cc | 1 - .../test/src/op-attrs/ops/element_unary.cc | 8 - .../include/realm-execution/realm_context.h | 19 +- .../include/realm-execution/sum_reduction.h | 99 ---- .../realm-execution/tasks/realm_reduction.h | 49 +- ...uted_per_device_op_state_initialization.cc | 6 +- .../src/realm-execution/pcg_instance.cc | 54 +++ .../src/realm-execution/realm_context.cc | 9 +- .../impl/per_device_op_state_init_task.cc | 16 +- .../tasks/realm_task_registry.cc | 10 + .../src/realm-execution/test_op_replicate.cc | 444 +++++++++--------- 
.../training_operation_attrs.dtg.toml | 4 + .../task-spec/dynamic_graph/copy_insertion.cc | 47 +- ...mic_open_dataflow_graph_from_mapped_pcg.cc | 127 +++++ .../task-spec/dynamic_graph/pass_expansion.cc | 43 ++ .../dynamic_graph/shard_expansion.cc | 125 ++++- .../src/task-spec/ops/impl/element_binary.cc | 8 +- .../src/task-spec/ops/impl/element_unary.cc | 8 +- 18 files changed, 713 insertions(+), 364 deletions(-) delete mode 100644 lib/realm-execution/include/realm-execution/sum_reduction.h diff --git a/lib/op-attrs/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/src/op-attrs/ops/element_unary.cc index 9d02923689..ca7e417814 100644 --- a/lib/op-attrs/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/src/op-attrs/ops/element_unary.cc @@ -35,7 +35,6 @@ ParallelTensorDimDegrees get_output_parallel_dim_degrees( ElementUnaryAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { ASSERT(input_degrees.sum_degree.value == 1); - ASSERT(input_degrees.discard_copy_degree.value == 1); return input_degrees; } diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index 672b160cbd..43b4be06d8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -62,13 +62,5 @@ TEST_SUITE(FF_TEST_SUITE) { SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p))); } - SUBCASE("discard copy degree > 1") { - positive_int degree = 2_p; - - CHECK_THROWS(get_output_shape( - attrs, - make_input( - SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p))); - } } } diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h index ab89e916c0..eab42d0d79 100644 --- a/lib/realm-execution/include/realm-execution/realm_context.h +++ b/lib/realm-execution/include/realm-execution/realm_context.h @@ -63,15 +63,18 @@ struct RealmContext { int priority = 0); ///\} - /** \name Data 
movement */ + /** \name Data movement and reduction */ ///\{ - Realm::Event issue_copy(ParallelTensorShape const &src_shape, - Realm::RegionInstance src_inst, - ParallelTensorShape const &dst_shape, - Realm::RegionInstance dst_inst, - Realm::ProfilingRequestSet const &requests, - Realm::Event wait_on = Realm::Event::NO_EVENT, - int priority = 0); + Realm::Event + issue_copy(ParallelTensorShape const &src_shape, + Realm::RegionInstance src_inst, + ParallelTensorShape const &dst_shape, + Realm::RegionInstance dst_inst, + Realm::ProfilingRequestSet const &requests, + Realm::Event wait_on = Realm::Event::NO_EVENT, + int priority = 0, + std::optional redop_id = std::nullopt, + bool exclusive = false); ///\} /** \name Instance management */ diff --git a/lib/realm-execution/include/realm-execution/sum_reduction.h b/lib/realm-execution/include/realm-execution/sum_reduction.h deleted file mode 100644 index b845b5b7f2..0000000000 --- a/lib/realm-execution/include/realm-execution/sum_reduction.h +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once -#include -#include "op-attrs/datatype.dtg.h" - -namespace FlexFlow { - -// Sum reduction for float -struct SumReductionFloat { - using LHS = float; - using RHS = float; - static const RHS identity; - - template - static void apply(LHS &lhs, RHS rhs) { - if (EXCLUSIVE) { - lhs += rhs; - } else { - // atomic add for non-exclusive - __sync_fetch_and_add((int*)&lhs, *(int*)&rhs); - // proper float atomic add — use union trick - union { float f; int i; } old_val, new_val; - do { - old_val.f = lhs; - new_val.f = old_val.f + rhs; - } while (!__sync_bool_compare_and_swap( - (int*)&lhs, old_val.i, new_val.i)); - } - } - - template - static void fold(RHS &rhs1, RHS rhs2) { - if (EXCLUSIVE) { - rhs1 += rhs2; - } else { - union { float f; int i; } old_val, new_val; - do { - old_val.f = rhs1; - new_val.f = old_val.f + rhs2; - } while (!__sync_bool_compare_and_swap( - (int*)&rhs1, old_val.i, new_val.i)); - } - } -}; - -const SumReductionFloat::RHS 
SumReductionFloat::identity = 0.0f; - -// Sum reduction for double -struct SumReductionDouble { - using LHS = double; - using RHS = double; - static const RHS identity; - - template - static void apply(LHS &lhs, RHS rhs) { - if (EXCLUSIVE) { - lhs += rhs; - } else { - union { double d; long long i; } old_val, new_val; - do { - old_val.d = lhs; - new_val.d = old_val.d + rhs; - } while (!__sync_bool_compare_and_swap( - (long long*)&lhs, old_val.i, new_val.i)); - } - } - - template - static void fold(RHS &rhs1, RHS rhs2) { - if (EXCLUSIVE) { - rhs1 += rhs2; - } else { - union { double d; long long i; } old_val, new_val; - do { - old_val.d = rhs1; - new_val.d = old_val.d + rhs2; - } while (!__sync_bool_compare_and_swap( - (long long*)&rhs1, old_val.i, new_val.i)); - } - } -}; - -const SumReductionDouble::RHS SumReductionDouble::identity = 0.0; - -// Reduction op IDs — must not conflict with other registered redops -enum SumReductionOpIDs { - REDOP_SUM_FLOAT = 1, - REDOP_SUM_DOUBLE = 2, -}; - -inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { - switch (dtype) { - case DataType::FLOAT: return REDOP_SUM_FLOAT; - case DataType::DOUBLE: return REDOP_SUM_DOUBLE; - default: - PANIC("no sum reduction registered for datatype {}", dtype); - } -} - -} // namespace FlexFlow diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h index d1d6e1d880..d9cf00441b 100644 --- a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h +++ b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h @@ -1,6 +1,6 @@ #pragma once -#include #include "op-attrs/datatype.dtg.h" +#include namespace FlexFlow { @@ -8,7 +8,7 @@ namespace FlexFlow { struct SumReductionFloat { using LHS = float; using RHS = float; - static constexpr RHS identity = 0.0f; // ← inside struct, constexpr + static constexpr RHS identity = 0.0f; // ← inside struct, constexpr template static 
void apply(LHS &lhs, RHS rhs) { @@ -16,14 +16,16 @@ struct SumReductionFloat { lhs += rhs; } else { // atomic add for non-exclusive - __sync_fetch_and_add((int*)&lhs, *(int*)&rhs); // proper float atomic add — use union trick - union { float f; int i; } old_val, new_val; + union { + float f; + int i; + } old_val, new_val; do { old_val.f = lhs; new_val.f = old_val.f + rhs; - } while (!__sync_bool_compare_and_swap( - (int*)&lhs, old_val.i, new_val.i)); + } while ( + !__sync_bool_compare_and_swap((int *)&lhs, old_val.i, new_val.i)); } } @@ -32,34 +34,39 @@ struct SumReductionFloat { if (EXCLUSIVE) { rhs1 += rhs2; } else { - union { float f; int i; } old_val, new_val; + union { + float f; + int i; + } old_val, new_val; do { old_val.f = rhs1; new_val.f = old_val.f + rhs2; - } while (!__sync_bool_compare_and_swap( - (int*)&rhs1, old_val.i, new_val.i)); + } while ( + !__sync_bool_compare_and_swap((int *)&rhs1, old_val.i, new_val.i)); } } }; - // Sum reduction for double struct SumReductionDouble { using LHS = double; using RHS = double; - static constexpr RHS identity = 0.0; // ← inside struct, constexpr + static constexpr RHS identity = 0.0; // ← inside struct, constexpr template static void apply(LHS &lhs, RHS rhs) { if (EXCLUSIVE) { lhs += rhs; } else { - union { double d; long long i; } old_val, new_val; + union { + double d; + long long i; + } old_val, new_val; do { old_val.d = lhs; new_val.d = old_val.d + rhs; } while (!__sync_bool_compare_and_swap( - (long long*)&lhs, old_val.i, new_val.i)); + (long long *)&lhs, old_val.i, new_val.i)); } } @@ -68,26 +75,31 @@ struct SumReductionDouble { if (EXCLUSIVE) { rhs1 += rhs2; } else { - union { double d; long long i; } old_val, new_val; + union { + double d; + long long i; + } old_val, new_val; do { old_val.d = rhs1; new_val.d = old_val.d + rhs2; } while (!__sync_bool_compare_and_swap( - (long long*)&rhs1, old_val.i, new_val.i)); + (long long *)&rhs1, old_val.i,
new_val.i)); } } }; // Reduction op IDs — must not conflict with other registered redops enum SumReductionOpIDs { - REDOP_SUM_FLOAT = 1, + REDOP_SUM_FLOAT = 1, REDOP_SUM_DOUBLE = 2, }; inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { switch (dtype) { - case DataType::FLOAT: return REDOP_SUM_FLOAT; - case DataType::DOUBLE: return REDOP_SUM_DOUBLE; + case DataType::FLOAT: + return REDOP_SUM_FLOAT; + case DataType::DOUBLE: + return REDOP_SUM_DOUBLE; default: PANIC("no sum reduction registered for datatype {}", dtype); } diff --git a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc index 1d517a8fe4..e7d8647b12 100644 --- a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc +++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc @@ -31,6 +31,7 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization( std::unordered_map *> device_state_map; + std::vector completion_events; for (DynamicNodeInvocation const &invocation : dg.invocations) { Realm::Processor target_proc = ctx.map_device_coord_to_processor( assert_unwrap(invocation.node_attrs.device_coord)); @@ -56,6 +57,7 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization( precondition); if (completion_event.has_value()) { + completion_events.push_back(completion_event.value()); device_state_map.insert(std::pair{invocation, device_state_ptr}); } else { // Task doesn't require initialization, clean up and don't store result @@ -63,7 +65,9 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization( } } - ctx.get_outstanding_events().wait(); + // wait for all init tasks — direct write to *result_ptr happens + // before each init task event fires so result is ready after this + 
Realm::Event::merge_events(completion_events).wait(); auto deref = [](DeviceSpecificPtr *const &p) { return *p; }; std::unordered_map> diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc index 0ecd02143e..a0653c3c37 100644 --- a/lib/realm-execution/src/realm-execution/pcg_instance.cc +++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc @@ -6,6 +6,7 @@ #include "realm-execution/instance_allocation.h" #include "realm-execution/realm_context.h" #include "realm-execution/tasks/impl/op_task.h" +#include "realm-execution/tasks/realm_reduction.h" #include "realm-execution/tensor_instance_backing.h" #include "task-spec/dynamic_graph/copy_insertion.h" #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" @@ -215,6 +216,46 @@ static Realm::Event spawn_dynamic_node_invocation( precondition); }; + // issue_replicate_bwd lambda + auto issue_replicate_bwd = [&]() { + std::optional output_grad_opt; + for (auto const &[slot, value] : invocation.inputs) { + if (slot.slot_tensor_role == DynamicTensorRole{FwbTensorType::GRADIENT}) { + output_grad_opt = value; + } + } + DynamicValueAttrs output_grad = assert_unwrap(output_grad_opt); + DynamicValueAttrs input_grad = get_only(invocation.outputs).second; + Realm::RegionInstance dst_inst = + tensor_instance_backing.backing.at(input_grad).first; + + Realm::ReductionOpID redop_id = get_sum_reduction_op_id( + assert_unwrap(output_grad.parallel_tensor_shape).data_type); + + // chain reductions sequentially to avoid write races on dst + Realm::Event e = precondition; + for (auto const &[p, m] : assert_unwrap(output_grad.mapping)) { + DynamicValueAttrs replica_key = output_grad; + replica_key.mapping = + bidict{{p, m}}; + replica_key.shard_coord = p; + + Realm::RegionInstance src_inst = + tensor_instance_backing.backing.at(replica_key).first; + + e = ctx.issue_copy(assert_unwrap(output_grad.parallel_tensor_shape), + src_inst, + 
assert_unwrap(input_grad.parallel_tensor_shape), + dst_inst, + Realm::ProfilingRequestSet{}, + e, + 0, + redop_id, + false); + } + return e; + }; + TrainingOperationAttrs op_attrs = assert_unwrap(invocation.node_attrs.op_attrs); return op_attrs.visit(overload{ @@ -222,11 +263,24 @@ static Realm::Event spawn_dynamic_node_invocation( return pcg_op_attrs.visit(overload{ [&](InputAttrs const &) { return Realm::Event::NO_EVENT; }, [&](WeightAttrs const &) { return Realm::Event::NO_EVENT; }, + [&](ReplicateAttrs const &) { + // this should never be reached since replicate + // goes through TrainingOperationAttrs::ReplicateAttrs + PANIC("unexpected replicate in PCGOperatorAttrs path"); + return Realm::Event::NO_EVENT; + }, [&](auto const &) { return spawn_task(); }, }); }, [&](LossAttrs const &) { return spawn_task(); }, [&](CopyAttrs const &) { return issue_copy(); }, + [&](ReplicateAttrs const &) { + if (invocation.node_attrs.task_type.has_value() && + invocation.node_attrs.task_type.value() == DynamicTaskType::BWD) { + return issue_replicate_bwd(); + } + return issue_copy(); + }, }); } diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc index 790c1bd613..a4669bf43e 100644 --- a/lib/realm-execution/src/realm-execution/realm_context.cc +++ b/lib/realm-execution/src/realm-execution/realm_context.cc @@ -161,7 +161,9 @@ Realm::Event Realm::RegionInstance dst_inst, Realm::ProfilingRequestSet const &requests, Realm::Event wait_on, - int priority) { + int priority, + std::optional redop_id, + bool exclusive) { TensorShape src_piece_shape = get_piece_shape(src_shape); TensorShape dst_piece_shape = get_piece_shape(dst_shape); ASSERT(src_piece_shape == dst_piece_shape); // For now, assume they match @@ -183,6 +185,11 @@ Realm::Event size_of_datatype(src_piece_shape.data_type).int_from_positive_int()), /*subfield_offset=*/0); + // set reduction op on dst field if provided + if (redop_id.has_value()) { + 
dst_field.set_redop(redop_id.value(), /*is_fold=*/false, exclusive); + } + Realm::Event result; switch (src_piece_shape.dims.ff_ordered.num_dims()) { #if REALM_MAX_DIM >= 1 diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc index 753fccf74b..0ea51810e4 100644 --- a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc +++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc @@ -66,11 +66,17 @@ void per_device_op_state_init_task_body(void const *args, result_state, ctx.get_current_device_idx())}; DeviceSpecificPtr result_device_specific{ ctx.get_current_device_idx(), result_state_ptr}; - spawn_per_device_op_state_init_return_task(ctx, - task_args.origin_proc, - result_device_specific, - task_args.origin_result_ptr, - Realm::Event::NO_EVENT); + + // replace spawn_per_device_op_state_init_return_task with: + // NOTE: SM/TODO: direct write assumes single-node shared address space + // For multi-node, replace with UserEvent trigger pattern + *task_args.origin_result_ptr = result_device_specific; + + // spawn_per_device_op_state_init_return_task(ctx, + // task_args.origin_proc, + // result_device_specific, + // task_args.origin_result_ptr, + // Realm::Event::NO_EVENT); } std::optional spawn_per_device_op_state_init_task( diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc index e7a8948f8d..acafdf59fd 100644 --- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc +++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc @@ -5,6 +5,7 @@ #include "realm-execution/tasks/impl/op_task.h" #include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h" #include "realm-execution/tasks/impl/per_device_op_state_init_task.h" +#include 
"realm-execution/tasks/realm_reduction.h" #include "realm-execution/tasks/task_id_t.h" #include "utils/exception.h" @@ -30,9 +31,18 @@ Realm::Event register_task(Realm::Processor::Kind target_kind, Realm::ProfilingRequestSet()); } +static void register_reductions() { + // register sum reduction ops + Realm::Runtime rt = Realm::Runtime::get_runtime(); + rt.register_reduction(REDOP_SUM_FLOAT); + rt.register_reduction(REDOP_SUM_DOUBLE); + // register_reduction is synchronous — no event returned +} + Realm::Event register_all_tasks() { std::vector pending_registrations; + register_reductions(); std::vector init_task_ids = { // Init tasks task_id_t::BATCHNORM_INIT_TASK_ID, diff --git a/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc b/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc index d1fc941007..632f08d239 100644 --- a/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc +++ b/lib/realm-execution/test/src/realm-execution/test_op_replicate.cc @@ -56,194 +56,207 @@ TEST_SUITE(FF_TEST_SUITE) { char **fake_argv = fake_args.data(); RealmManager manager = RealmManager{&fake_argc, &fake_argv}; - ControllerTaskResult result = manager.start_controller([](RealmContext - &ctx) { - Allocator allocator = ctx.get_current_device_allocator(); - - positive_int batch_size = 10_p; - positive_int data_dim = 16_p; - positive_int hidden_dim = 32_p; - positive_int output_dim = 1_p; - - // 10,2 - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - - // 10,2 - TensorShape label_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - - GenericTensorAccessorW label_tensor = - allocator.allocate_tensor(label_tensor_shape); - - // construct computation graph - ParallelComputationGraph pcg = empty_parallel_computation_graph(); - - // input tensor - // 10, 16 - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 
data_dim}}, DataType::FLOAT}; - - // parallel layer -> input tensor - ParallelLayerAddedResult inputs_layer = - pcg_add_input_layer(pcg, input_tensor_shape); - parallel_tensor_guid_t t_input = - require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); - - // parallel layer -> input tensor 2 - ParallelLayerAddedResult inputs_layer_2 = - pcg_add_input_layer(pcg, input_tensor_shape); - parallel_tensor_guid_t t_input_2 = - require_only_key(inputs_layer_2.outputs, TensorSlotName::OUTPUT); - - // binary ADD attribute - ElementBinaryAttrs add_attrs = ElementBinaryAttrs{ - OperatorType::EW_ADD, - DataType::FLOAT, - false, - false, - }; - - // parallel layer -> perform add - ParallelLayerAddedResult add_operator_1 = - add_parallel_layer(pcg, make_layer_attrs(add_attrs), - { - { - TensorSlotName::LHS_INPUT, - t_input, - }, + ControllerTaskResult result = + manager.start_controller([](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + // 10,2 + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + // 10,2 + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + + // construct computation graph + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input tensor + // 10, 16 + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + // parallel layer -> input tensor + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> input tensor 2 + 
ParallelLayerAddedResult inputs_layer_2 = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input_2 = + require_only_key(inputs_layer_2.outputs, TensorSlotName::OUTPUT); + + // binary ADD attribute + ElementBinaryAttrs add_attrs = ElementBinaryAttrs{ + OperatorType::EW_ADD, + DataType::FLOAT, + false, + false, + }; + + // parallel layer -> perform add + ParallelLayerAddedResult add_operator_1 = + add_parallel_layer(pcg, + make_layer_attrs(add_attrs), { - TensorSlotName::RHS_INPUT, - t_input_2, + { + TensorSlotName::LHS_INPUT, + t_input, + }, + { + TensorSlotName::RHS_INPUT, + t_input_2, + }, }, - }, - {/* weight */}); - - parallel_tensor_guid_t t_add_1 = - require_only_key(add_operator_1.outputs, TensorSlotName::OUTPUT); - - // parallel layer -> perform replicate - const positive_int replicate_degree = 2_p; - ReplicateAttrs repl_attrs = ReplicateAttrs(replicate_degree); - ParallelLayerAddedResult repl_operator_1 = - add_parallel_layer(pcg, make_layer_attrs(repl_attrs), - { + {/* weight */}); + + parallel_tensor_guid_t t_add_1 = + require_only_key(add_operator_1.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> perform replicate + const positive_int replicate_degree = 2_p; + ReplicateAttrs repl_attrs = ReplicateAttrs(replicate_degree); + ParallelLayerAddedResult repl_operator_1 = + add_parallel_layer(pcg, + make_layer_attrs(repl_attrs), { - TensorSlotName::INPUT, - t_add_1, + { + TensorSlotName::INPUT, + t_add_1, + }, }, - }, - /*weight=*/{}); - // output of replicate layer - parallel_tensor_guid_t t_repl_1 = - require_only_key(repl_operator_1.outputs, TensorSlotName::OUTPUT); - - // parallel layer -> perform RelU - ParallelLayerAddedResult relu_operator_1 = - add_parallel_layer(pcg, make_layer_attrs(make_relu_attrs()), - /*inputs=*/ - { + /*weight=*/{}); + // output of replicate layer + parallel_tensor_guid_t t_repl_1 = + require_only_key(repl_operator_1.outputs, TensorSlotName::OUTPUT); + + // parallel layer -> perform RelU + 
ParallelLayerAddedResult relu_operator_1 = + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + /*inputs=*/ { - TensorSlotName::INPUT, - t_repl_1, + { + TensorSlotName::INPUT, + t_repl_1, + }, }, - }, - /*weights=*/{}); - // output of relu layer - parallel_tensor_guid_t t_relu_1 = - require_only_key(relu_operator_1.outputs, TensorSlotName::OUTPUT); - - // machine - MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; - MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; - - ParallelTensorSpaceCoordinate tensor_coord0{ - /* sum_component */ 0_n, /* discard_copy_component */ 0_n, - /*shard_component*/ FFOrdered{0_n}}; - ParallelTensorSpaceCoordinate tensor_coord1{ - /* sum_component */ 0_n, /* discard_copy_component */ 1_n, - /*shard_component*/ FFOrdered{0_n}}; - MappedParallelComputationGraph mpcg{ - pcg, - {{inputs_layer.parallel_layer, - MappedOperatorTaskGroup{ - {{cpu0, OperatorAtomicTaskShardBinding{{{TensorSlotName::OUTPUT, - tensor_coord0}}}}}}}, - {inputs_layer_2.parallel_layer, - MappedOperatorTaskGroup{ - {{cpu0, OperatorAtomicTaskShardBinding{{{TensorSlotName::OUTPUT, - tensor_coord0}}}}}}}, - {add_operator_1.parallel_layer, - MappedOperatorTaskGroup{ - {{cpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::LHS_INPUT, tensor_coord0}, - {TensorSlotName::RHS_INPUT, tensor_coord0}, - {TensorSlotName::OUTPUT, tensor_coord0}, - }}}}}}, - {repl_operator_1.parallel_layer, - MappedOperatorTaskGroup{{ - {cpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::OUTPUT, tensor_coord0}, - }}}, - {cpu1, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::OUTPUT, tensor_coord1}, - }}}, - }}}, - {relu_operator_1.parallel_layer, - MappedOperatorTaskGroup{{ - {cpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::INPUT, tensor_coord0}, - {TensorSlotName::OUTPUT, tensor_coord0}, - }}}, - {cpu1, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::INPUT, tensor_coord1}, - {TensorSlotName::OUTPUT, tensor_coord1}, - }}}, - }}}}, - }; - - 
MappedOperatorTaskGroup loss_mapping{ - {{cpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::INPUT, tensor_coord0}, - {TensorSlotName::LOGIT, tensor_coord0}, - }}}}}; - - // instantiate computation graph - LossAttrs loss_attrs = LossAttrs{ - NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - std::unordered_map - input_tensors; - - DistributedFfHandle device_handle = - create_distributed_ff_handle(ctx, - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - PCGInstance pcg_instance = create_pcg_instance( - /*ctx=*/ctx, - /*mpcg=*/mpcg, - /*optimizer=*/optimizer_attrs, - /*loss=*/std::nullopt, - /*input_tensors=*/input_tensors, - /*profiling_settings=*/ProfilingSettings{0, 0}, - /*device_handle=*/device_handle, - /*iteration_config=*/FFIterationConfig{1_p}); - - // begin training loop - int num_epochs = 1; - for (int i = 0; i < num_epochs; i++) { - perform_all_passes_for_pcg_instance( - /*instance=*/pcg_instance, - /*profiling_settings=*/ProfilingSettings{0, 0}, - /*device_handle=*/device_handle, - /*iteration_config=*/FFIterationConfig{1_p}); - } - }); + /*weights=*/{}); + // output of relu layer + parallel_tensor_guid_t t_relu_1 = + require_only_key(relu_operator_1.outputs, TensorSlotName::OUTPUT); + + // machine + MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; + MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; + + ParallelTensorSpaceCoordinate tensor_coord0{ + /* sum_component */ 0_n, + /* discard_copy_component */ 0_n, + /*shard_component*/ FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord1{ + /* sum_component */ 0_n, + /* discard_copy_component */ 1_n, + /*shard_component*/ FFOrdered{0_n}}; + MappedParallelComputationGraph mpcg{ + pcg, + {{inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ 
+ {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {inputs_layer_2.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {add_operator_1.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::LHS_INPUT, tensor_coord0}, + {TensorSlotName::RHS_INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + {repl_operator_1.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}, + }}}, + {relu_operator_1.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord1}, + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}, + }}}}, + }; + + MappedOperatorTaskGroup loss_mapping{ + {{cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::LOGIT, tensor_coord0}, + }}}}}; + + // instantiate computation graph + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = create_distributed_ff_handle( + ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + PCGInstance pcg_instance = create_pcg_instance( + /*ctx=*/ctx, + /*mpcg=*/mpcg, + /*optimizer=*/optimizer_attrs, + /*loss=*/std::nullopt, + /*input_tensors=*/input_tensors, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + 
/*iteration_config=*/FFIterationConfig{1_p}); + + // begin training loop + int num_epochs = 1; + for (int i = 0; i < num_epochs; i++) { + perform_all_passes_for_pcg_instance( + /*instance=*/pcg_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + } + }); result.wait(); } } @@ -307,7 +320,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // parallel layer -> perform add ParallelLayerAddedResult add_operator_1 = - add_parallel_layer(pcg, make_layer_attrs(add_attrs), + add_parallel_layer(pcg, + make_layer_attrs(add_attrs), { { TensorSlotName::LHS_INPUT, @@ -327,7 +341,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { const positive_int replicate_degree = 2_p; ReplicateAttrs repl_attrs = ReplicateAttrs(replicate_degree); ParallelLayerAddedResult repl_operator_1 = - add_parallel_layer(pcg, make_layer_attrs(repl_attrs), + add_parallel_layer(pcg, + make_layer_attrs(repl_attrs), { { TensorSlotName::INPUT, @@ -341,7 +356,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // parallel layer -> perform RelU ParallelLayerAddedResult relu_operator_1 = - add_parallel_layer(pcg, make_layer_attrs(make_relu_attrs()), + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), /*inputs=*/ { { @@ -357,8 +373,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // machine MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU}; MachineSpaceCoordinate gpu1{0_n, 1_n, DeviceType::GPU}; - ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; - ParallelTensorSpaceCoordinate tensor_coord1{0_n, 1_n, FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord1{0_n, 1_n, FFOrdered{0_n}}; MappedParallelComputationGraph mpcg{ pcg, { @@ -374,38 +390,44 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, {add_operator_1.parallel_layer, MappedOperatorTaskGroup{ - {{gpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::LHS_INPUT, 
tensor_coord0}, - {TensorSlotName::RHS_INPUT, tensor_coord0}, - {TensorSlotName::OUTPUT, tensor_coord0}, - }}}}}}, + {{gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::LHS_INPUT, tensor_coord0}, + {TensorSlotName::RHS_INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, {repl_operator_1.parallel_layer, - MappedOperatorTaskGroup{{ - {gpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::OUTPUT, tensor_coord0}, - }}}, - {gpu1, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::OUTPUT, tensor_coord1}, - }}}}}}, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}}}}, {relu_operator_1.parallel_layer, MappedOperatorTaskGroup{{ - {gpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::INPUT, tensor_coord0}, - {TensorSlotName::OUTPUT, tensor_coord0}, - }}}, - {gpu1, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::INPUT, tensor_coord1}, - {TensorSlotName::OUTPUT, tensor_coord1}, - }}}, + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord1}, + {TensorSlotName::OUTPUT, tensor_coord1}, + }}}, }}}, }, }; MappedOperatorTaskGroup loss_mapping{ - {{gpu0, OperatorAtomicTaskShardBinding{{ - {TensorSlotName::INPUT, tensor_coord0}, - {TensorSlotName::LOGIT, tensor_coord0}, - }}}}}; + {{gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::LOGIT, tensor_coord0}, + }}}}}; // instantiate computation graph LossAttrs loss_attrs = LossAttrs{ diff --git a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml index 8f8f6467c8..2bd0714512 100644 --- 
a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml @@ -25,3 +25,7 @@ key = "loss" [[values]] type = "::FlexFlow::CopyAttrs" key = "copy" + +[[values]] +type = "::FlexFlow::ReplicateAttrs" +key = "replicate" diff --git a/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc index 4c1b9d4609..7a28e254aa 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc @@ -25,15 +25,43 @@ bool node_is_copy(DynamicNodeAttrs const &n) { return n.op_attrs.has_value() && n.op_attrs.value().is_copy(); } +static bool is_replicate_invocation(DynamicNodeInvocation const &i) { + if (!i.node_attrs.op_attrs.has_value()) { + return false; + } + TrainingOperationAttrs const &op_attrs = i.node_attrs.op_attrs.value(); + if (op_attrs.is_replicate()) { + return true; + } + return false; +} + bool value_is_mapped(DynamicValueAttrs const &n) { return n.mapping.has_value(); } bool no_part_of_graph_is_copy_inserted(DynamicOpenDataflowGraph const &g) { auto slot_is_mapped = [](DynamicTensorSlot const &) -> bool { return false; }; - - return no_part_of_dynamic_graph_satisfies( - g, node_is_copy, value_is_mapped, slot_is_mapped); + // check all non-replicate invocations + for (DynamicNodeInvocation const &i : g.invocations) { + if (is_replicate_invocation(i)) { + continue; // replicate tensors have mapping set by design + } + if (node_is_copy(i.node_attrs)) { + return false; + } + for (auto const &[slot, value] : i.inputs) { + if (value_is_mapped(value)) { + return false; + } + } + for (auto const &[slot, value] : i.outputs) { + if (value_is_mapped(value)) { + return false; + } + } + } + return true; } bool graph_is_fully_copy_inserted(DynamicOpenDataflowGraph const &g) { @@ -85,6 +113,11 @@ std::unordered_set 
perform_copy_insertion_for_invocation( std::unordered_map const &unmapped_value_to_mapped_source_value) { + // replicate nodes have no MappedOperatorTaskGroup — + // pass through unchanged, no copies needed + if (is_replicate_invocation(i)) { + return {i}; + } MappedOperatorTaskGroup mapping = assert_unwrap(i.node_attrs.mapping); auto map_tensor = [&](DynamicTensorSlot const &slot, @@ -157,6 +190,14 @@ DynamicOpenDataflowGraph std::unordered_map unmapped_value_to_mapped_source_value; for (DynamicNodeInvocation const &i : g.invocations) { + // replicate nodes have no MappedOperatorTaskGroup — + // output mapping already fully set, maps to itself + if (is_replicate_invocation(i)) { + for (auto const &[slot, value] : i.outputs) { + unmapped_value_to_mapped_source_value.insert(std::pair{value, value}); + } + continue; + } for (auto const &[slot, value] : i.outputs) { unmapped_value_to_mapped_source_value.insert( std::pair{value, diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc index 246f9a3242..3d48a0dc2b 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc @@ -7,11 +7,129 @@ #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" #include "task-spec/dynamic_graph/dynamic_tensor_role.h" #include "utils/containers/generate_map.h" +#include "utils/containers/get_only.h" #include #include #include namespace FlexFlow { +static bidict + get_input_mapping_for_replicate( + MappedParallelComputationGraph const &mpcg, + parallel_layer_guid_t const &replicate_layer) { + + auto [input_slot_name, input_tensor_guid] = + get_only(get_incoming_tensors(mpcg.pcg, replicate_layer)); + + // find the layer that produces this tensor + for (auto const &[layer, _] : 
get_parallel_layer_attrs_mapping(mpcg.pcg)) { + for (auto const &[slot_name, t] : get_outgoing_tensors(mpcg.pcg, layer)) { + if (t == input_tensor_guid) { + MappedOperatorTaskGroup producer_mapping = mpcg.mapped_tasks.at(layer); + return get_tensor_bindings_for_slot_name(producer_mapping, slot_name); + } + } + } + + PANIC("could not find producer of replicate layer input tensor"); +} + +static std::unordered_map + get_consumers_of_tensor(MappedParallelComputationGraph const &mpcg, + parallel_tensor_guid_t const &tensor) { + std::unordered_map result; + for (auto const &[layer, _] : get_parallel_layer_attrs_mapping(mpcg.pcg)) { + for (auto const &[slot_name, t] : get_incoming_tensors(mpcg.pcg, layer)) { + if (t == tensor) { + result.insert({layer, slot_name}); + } + } + } + return result; +} + +static bidict + build_replicated_output_mapping( + MappedParallelComputationGraph const &mpcg, + parallel_layer_guid_t const &replicate_layer) { + + auto [output_slot_name, output_tensor_guid] = + get_only(get_outgoing_tensors(mpcg.pcg, replicate_layer)); + + auto consumers = get_consumers_of_tensor(mpcg, output_tensor_guid); + ASSERT(!consumers.empty()); + + // union all consumer bindings — each consumer shard maps to a distinct + // (discard_copy, machine) pair since replicas are always on different machines + bidict result; + for (auto const &[consumer_layer, slot_name] : consumers) { + MappedOperatorTaskGroup consumer_mapping = + mpcg.mapped_tasks.at(consumer_layer); + bidict binding = + get_tensor_bindings_for_slot_name(consumer_mapping, slot_name); + for (auto const &[p, m] : binding) { + result.equate(p, m); + } + } + return result; +} + +static DynamicNodeInvocation + build_replicate_invocation(parallel_layer_guid_t const &layer, + ParallelLayerAttrs const &attrs, + MappedParallelComputationGraph const &mpcg) { + auto [input_slot_name, input_tensor_guid] = + get_only(get_incoming_tensors(mpcg.pcg, layer)); + auto incoming = get_incoming_tensors(mpcg.pcg, layer); + 
ASSERT(!incoming.empty(), + "replicate layer has no incoming tensors — " + "check PCG edge construction in test"); + + ParallelTensorAttrs input_attrs = + get_parallel_tensor_attrs(mpcg.pcg, input_tensor_guid); + bidict input_mapping = + get_input_mapping_for_replicate(mpcg, layer); + + DynamicValueAttrs input_value{ + /*tensor_guid=*/dynamic_tensor_guid_t{input_tensor_guid}, + /*parallel_tensor_shape=*/input_attrs.shape, + /*shard_coord=*/std::nullopt, + /*mapping=*/input_mapping, + /*accessor=*/std::nullopt, + /*role=*/std::nullopt, + }; + + auto [output_slot_name, output_tensor_guid] = + get_only(get_outgoing_tensors(mpcg.pcg, layer)); + ParallelTensorAttrs output_attrs = + get_parallel_tensor_attrs(mpcg.pcg, output_tensor_guid); + + DynamicValueAttrs output_value{ + /*tensor_guid=*/dynamic_tensor_guid_t{output_tensor_guid}, + /*parallel_tensor_shape=*/output_attrs.shape, + /*shard_coord=*/std::nullopt, + /*mapping=*/build_replicated_output_mapping(mpcg, layer), + /*accessor=*/std::nullopt, + /*role=*/std::nullopt, + }; + DynamicNodeAttrs node_attrs{ + /*task_type=*/std::nullopt, + /*device_coord=*/std::nullopt, + /*mapping=*/std::nullopt, + /*op_attrs=*/TrainingOperationAttrs{attrs.op_attrs.get()}, + /*pcg_layer_guid=*/dynamic_layer_guid_t{layer}, + /*per_device_op_state=*/std::nullopt, + }; + + DynamicNodeInvocation invocation_node{ + /*inputs=*/{ + {DynamicTensorSlot{input_slot_name, std::nullopt}, input_value}}, + /*node_attrs=*/node_attrs, + /*outputs=*/ + {{DynamicTensorSlot{output_slot_name, std::nullopt}, output_value}}, + }; + return invocation_node; +} DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mapped_pcg( MappedParallelComputationGraph const &mpcg) { @@ -19,6 +137,15 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mapped_pcg( for (auto const &[layer, attrs] : get_parallel_layer_attrs_mapping(mpcg.pcg)) { + + if (attrs.op_attrs.has()) { + // build replicate invocation +
DynamicNodeInvocation repl_inv = + build_replicate_invocation(layer, attrs, mpcg); + result.invocations.emplace(repl_inv); + continue; + } + DynamicNodeAttrs result_attrs{ /*task_type=*/std::nullopt, /*device_coord=*/std::nullopt, diff --git a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc index 0cee06368f..aed5f2c4c3 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc @@ -4,6 +4,7 @@ #include "utils/containers/are_all_same.h" #include "utils/containers/merge_disjoint_maps.h" #include "utils/containers/transform.h" +#include "utils/containers/get_only.h" namespace FlexFlow { @@ -109,6 +110,44 @@ DynamicNodeInvocation perform_bwd_pass_expansion_for_invocation( transform(invocation.inputs, to_grad), }; } +static std::unordered_set + perform_pass_expansion_for_replicate( + DynamicNodeInvocation const &invocation) { + + auto const &[input_slot, input] = get_only(invocation.inputs); + auto const &[output_slot, output] = get_only(invocation.outputs); + + // forward: INPUT/FWD → OUTPUT/FWD (copy to replicas) + DynamicNodeInvocation fwd{ + /*inputs=*/{{pass_expand_slot(input_slot, FwbTensorType::FORWARD), + pass_expand_value(input, FwbTensorType::FORWARD)}}, + /*node_attrs=*/ + pass_expand_node(invocation.node_attrs, DynamicTaskType::FWD), + /*outputs=*/ + {{pass_expand_slot(output_slot, FwbTensorType::FORWARD), + pass_expand_value(output, FwbTensorType::FORWARD)}}, + }; + + // backward: OUTPUT/FWD + OUTPUT/GRAD → INPUT/GRAD (reduce gradients) + // The backward node needs the mapping from the output (replicated) + // so it knows which replicas to reduce from + DynamicNodeAttrs bwd_node_attrs = invocation.node_attrs; + bwd_node_attrs.task_type = DynamicTaskType::BWD; + + DynamicNodeInvocation bwd{ + /*inputs=*/{ + {pass_expand_slot(output_slot, FwbTensorType::FORWARD), + pass_expand_value(output, 
FwbTensorType::FORWARD)}, + {pass_expand_slot(output_slot, FwbTensorType::GRADIENT), + pass_expand_value(output, FwbTensorType::GRADIENT)}, + }, + /*node_attrs=*/bwd_node_attrs, + /*outputs=*/ + {{pass_expand_slot(input_slot, FwbTensorType::GRADIENT), + pass_expand_value(input, FwbTensorType::GRADIENT)}}, + }; + return {fwd, bwd}; +} DynamicOpenDataflowGraph perform_pass_expansion(DynamicOpenDataflowGraph const &g) { @@ -117,6 +156,10 @@ DynamicOpenDataflowGraph DynamicOpenDataflowGraph result = flatmap_dynamic_invocation_set( g, [](DynamicNodeInvocation const &invocation) { + if (invocation.node_attrs.op_attrs.has_value() && + invocation.node_attrs.op_attrs.value().is_replicate()) { + return perform_pass_expansion_for_replicate(invocation); + } if (invocation.inputs.empty()) { return std::unordered_set{ perform_fwd_pass_expansion_for_invocation(invocation), diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc index fb6efb96d0..f30a4d8470 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc @@ -39,7 +39,6 @@ bool graph_is_fully_shard_expanded(DynamicOpenDataflowGraph const &g) { value_is_shard_expanded, slot_is_shard_expanded); } - static bidict restrict_tensor_mapping_keys_to_coord( bidict const @@ -85,6 +84,114 @@ static DynamicNodeInvocation shard_invocation_for_binding( }; } +static std::unordered_set + perform_shard_expansion_for_replicate(DynamicNodeInvocation const &i) { + auto const &[input_slot, input] = get_only(i.inputs); + auto const &[output_slot, output] = get_only(i.outputs); + + bidict input_mapping = + assert_unwrap(input.mapping); + bidict output_mapping = + assert_unwrap(output.mapping); + + return transform(output_mapping.left_values(), + [&](ParallelTensorSpaceCoordinate const &p) { + ParallelTensorSpaceCoordinate input_p{ + /*sum_component=*/p.sum_component, + 
/*discard_copy_component=*/nonnegative_int{0}, + /*shard_components=*/p.shard_components, + }; + return shard_invocation_for_binding( + i, + output_mapping.at_l(p), + OperatorAtomicTaskShardBinding{{ + {input_slot.slot_name, input_p}, + {output_slot.slot_name, p}, + }}); + }); +} + +static std::unordered_set + perform_shard_expansion_for_replicate_bwd(DynamicNodeInvocation const &i) { + + std::optional output_grad_opt; + std::optional output_fwd_opt; + std::optional output_grad_slot_opt; + std::optional output_fwd_slot_opt; + + for (auto const &[slot, value] : i.inputs) { + if (slot.slot_tensor_role == DynamicTensorRole{FwbTensorType::GRADIENT}) { + output_grad_slot_opt = slot; + output_grad_opt = value; + } else { + output_fwd_slot_opt = slot; + output_fwd_opt = value; + } + } + + DynamicValueAttrs output_grad = assert_unwrap(output_grad_opt); + DynamicValueAttrs output_fwd = assert_unwrap(output_fwd_opt); + DynamicTensorSlot output_grad_slot = assert_unwrap(output_grad_slot_opt); + DynamicTensorSlot output_fwd_slot = assert_unwrap(output_fwd_slot_opt); + auto const &[input_grad_slot, input_grad] = get_only(i.outputs); + + bidict + output_grad_mapping = assert_unwrap(output_grad.mapping); + bidict + input_grad_mapping = assert_unwrap(input_grad.mapping); + + std::unordered_map, + std::unordered_set> + by_shard; + for (auto const &p : output_grad_mapping.left_values()) { + by_shard[p.shard_components].insert(p); + } + + std::unordered_set result; + for (auto const &[shard_components, replica_coords] : by_shard) { + ParallelTensorSpaceCoordinate src_p{ + nonnegative_int{0}, nonnegative_int{0}, shard_components}; + MachineSpaceCoordinate src_machine = input_grad_mapping.at_l(src_p); + + bidict + replica_mapping; + for (auto const &p : replica_coords) { + replica_mapping.equate(p, output_grad_mapping.at_l(p)); + } + + DynamicValueAttrs sharded_output_grad = output_grad; + sharded_output_grad.mapping = replica_mapping; + sharded_output_grad.shard_coord = src_p; + + 
DynamicValueAttrs sharded_output_fwd = output_fwd; + sharded_output_fwd.mapping = replica_mapping; + sharded_output_fwd.shard_coord = src_p; + + DynamicValueAttrs sharded_input_grad = input_grad; + sharded_input_grad.mapping = + bidict{ + {src_p, src_machine}}; + sharded_input_grad.shard_coord = src_p; + + DynamicNodeAttrs sharded_node = i.node_attrs; + sharded_node.device_coord = src_machine; + + result.insert(DynamicNodeInvocation{ + /*inputs=*/{ + {output_fwd_slot, sharded_output_fwd}, + {output_grad_slot, sharded_output_grad}, + }, + /*node_attrs=*/sharded_node, + /*outputs=*/ + { + {input_grad_slot, sharded_input_grad}, + }, + }); + } + return result; +} + + static std::unordered_set perform_shard_expansion_for_copy(DynamicNodeInvocation const &i) { auto [input_slot, input] = get_only(i.inputs); @@ -121,6 +228,22 @@ std::unordered_set return perform_shard_expansion_for_copy(i); } + // forward replicate + if (i.node_attrs.op_attrs.has_value() && + i.node_attrs.op_attrs.value().is_replicate() && + i.node_attrs.task_type.has_value() && + i.node_attrs.task_type.value() == DynamicTaskType::FWD) { + return perform_shard_expansion_for_replicate(i); + } + + // backward replicate + if (i.node_attrs.op_attrs.has_value() && + i.node_attrs.op_attrs.value().is_replicate() && + i.node_attrs.task_type.has_value() && + i.node_attrs.task_type.value() == DynamicTaskType::BWD) { + return perform_shard_expansion_for_replicate_bwd(i); + } + MappedOperatorTaskGroup mapping = assert_unwrap(i.node_attrs.mapping); std::unordered_set shard_machine_coords = diff --git a/lib/task-spec/src/task-spec/ops/impl/element_binary.cc b/lib/task-spec/src/task-spec/ops/impl/element_binary.cc index 13465d7a5f..c8460af538 100644 --- a/lib/task-spec/src/task-spec/ops/impl/element_binary.cc +++ b/lib/task-spec/src/task-spec/ops/impl/element_binary.cc @@ -36,8 +36,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_profiling_settings(); 
DeviceType kernel_device_type = acc.get_kernel_device_type(); - ElementBinaryPerDeviceState per_device_state = - acc.get_per_device_op_state().require_element_binary().value(); + std::optional per_device_state = + acc.get_per_device_op_state().require_element_binary(); ElementBinaryAttrs attrs = acc.get_op_attrs().require_element_binary(); device_handle_t handle = acc.get_ff_handle(); @@ -62,8 +62,8 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_profiling_settings(); DeviceType kernel_device_type = acc.get_kernel_device_type(); - ElementBinaryPerDeviceState per_device_state = - acc.get_per_device_op_state().require_element_binary().value(); + std::optional per_device_state = + acc.get_per_device_op_state().require_element_binary(); ElementBinaryAttrs attrs = acc.get_op_attrs().require_element_binary(); device_handle_t handle = acc.get_ff_handle(); diff --git a/lib/task-spec/src/task-spec/ops/impl/element_unary.cc b/lib/task-spec/src/task-spec/ops/impl/element_unary.cc index d66ff9ab8d..9a092b90b8 100644 --- a/lib/task-spec/src/task-spec/ops/impl/element_unary.cc +++ b/lib/task-spec/src/task-spec/ops/impl/element_unary.cc @@ -35,8 +35,8 @@ static std::optional ProfilingSettings profiling = acc.get_profiling_settings(); DeviceType kernel_device_type = acc.get_kernel_device_type(); - ElementUnaryPerDeviceState per_device_state = - acc.get_per_device_op_state().require_element_unary().value(); + std::optional per_device_state = + acc.get_per_device_op_state().require_element_unary(); return profile(forward_kernel, profiling, @@ -62,8 +62,8 @@ static std::optional ProfilingSettings profiling = acc.get_profiling_settings(); DeviceType kernel_device_type = acc.get_kernel_device_type(); - ElementUnaryPerDeviceState per_device_state = - acc.get_per_device_op_state().require_element_unary().value(); + std::optional per_device_state = + acc.get_per_device_op_state().require_element_unary(); return 
profile(backward_kernel, profiling, From d033e22f77d08fc6b4d1151ef7d6bf7cc23281cb Mon Sep 17 00:00:00 2001 From: Seema Mirchandaney Date: Tue, 14 Apr 2026 17:10:12 -0700 Subject: [PATCH 3/6] remove ReplicateAttr --- .../src/realm-execution/pcg_instance.cc | 17 ++++++----------- .../src/realm-execution/tasks/task_id_t.cc | 12 +++--------- .../training_operation_attrs.dtg.toml | 4 ---- .../task-spec/dynamic_graph/copy_insertion.cc | 13 +++++-------- ...namic_open_dataflow_graph_from_mapped_pcg.cc | 2 +- .../task-spec/dynamic_graph/pass_expansion.cc | 10 +++++++--- .../task-spec/dynamic_graph/shard_expansion.cc | 16 +++++++++------- 7 files changed, 31 insertions(+), 43 deletions(-) diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc index a0653c3c37..17c62fe70c 100644 --- a/lib/realm-execution/src/realm-execution/pcg_instance.cc +++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc @@ -264,23 +264,18 @@ static Realm::Event spawn_dynamic_node_invocation( [&](InputAttrs const &) { return Realm::Event::NO_EVENT; }, [&](WeightAttrs const &) { return Realm::Event::NO_EVENT; }, [&](ReplicateAttrs const &) { - // this should never be reached since replicate - // goes through TrainingOperationAttrs::ReplicateAttrs - PANIC("unexpected replicate in PCGOperatorAttrs path"); - return Realm::Event::NO_EVENT; + if (invocation.node_attrs.task_type.has_value() && + invocation.node_attrs.task_type.value() == + DynamicTaskType::BWD) { + return issue_replicate_bwd(); + } + return issue_copy(); // forward }, [&](auto const &) { return spawn_task(); }, }); }, [&](LossAttrs const &) { return spawn_task(); }, [&](CopyAttrs const &) { return issue_copy(); }, - [&](ReplicateAttrs const &) { - if (invocation.node_attrs.task_type.has_value() && - invocation.node_attrs.task_type.value() == DynamicTaskType::BWD) { - return issue_replicate_bwd(); - } - return issue_copy(); - }, }); } diff --git 
a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc index dd4b0a66ca..0bdc2ca6b5 100644 --- a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc +++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc @@ -64,9 +64,7 @@ std::optional [](RepartitionAttrs const &attrs) { return task_id_t::REPARTITION_INIT_TASK_ID; }, - [](ReplicateAttrs const &attrs) { - return task_id_t::REPLICATE_INIT_TASK_ID; - }, + [](ReplicateAttrs const &attrs) { return std::nullopt; }, [](ReshapeAttrs const &) { return std::nullopt; }, [](ReverseAttrs const &) { return std::nullopt; }, [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_INIT_TASK_ID; }, @@ -115,9 +113,7 @@ std::optional [](RepartitionAttrs const &attrs) { return task_id_t::REPARTITION_FWD_TASK_ID; }, - [](ReplicateAttrs const &attrs) { - return task_id_t::REPLICATE_FWD_TASK_ID; - }, + [](ReplicateAttrs const &attrs) { return std::nullopt; }, [](ReshapeAttrs const &) { return task_id_t::RESHAPE_FWD_TASK_ID; }, [](ReverseAttrs const &) { return task_id_t::REVERSE_FWD_TASK_ID; }, [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_FWD_TASK_ID; }, @@ -166,9 +162,7 @@ std::optional [](RepartitionAttrs const &attrs) { return task_id_t::REPARTITION_BWD_TASK_ID; }, - [](ReplicateAttrs const &attrs) { - return task_id_t::REPLICATE_BWD_TASK_ID; - }, + [](ReplicateAttrs const &attrs) { return std::nullopt; }, [](ReshapeAttrs const &) { return task_id_t::RESHAPE_BWD_TASK_ID; }, [](ReverseAttrs const &) { return task_id_t::REVERSE_BWD_TASK_ID; }, [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_BWD_TASK_ID; }, diff --git a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml index 2bd0714512..8f8f6467c8 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml +++ 
b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml @@ -25,7 +25,3 @@ key = "loss" [[values]] type = "::FlexFlow::CopyAttrs" key = "copy" - -[[values]] -type = "::FlexFlow::ReplicateAttrs" -key = "replicate" diff --git a/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc index 7a28e254aa..ef41042a51 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc @@ -26,14 +26,11 @@ bool node_is_copy(DynamicNodeAttrs const &n) { } static bool is_replicate_invocation(DynamicNodeInvocation const &i) { - if (!i.node_attrs.op_attrs.has_value()) { - return false; - } - TrainingOperationAttrs const &op_attrs = i.node_attrs.op_attrs.value(); - if (op_attrs.is_replicate()) { - return true; - } - return false; + return i.node_attrs.op_attrs.has_value() && + i.node_attrs.op_attrs.value().has() && + i.node_attrs.op_attrs.value() + .get() + .has(); } bool value_is_mapped(DynamicValueAttrs const &n) { diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc index 3d48a0dc2b..a4ef156db9 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc @@ -116,7 +116,7 @@ static DynamicNodeInvocation /*task_type=*/std::nullopt, /*device_coord=*/std::nullopt, /*mapping=*/std::nullopt, - /*op_attrs=*/TrainingOperationAttrs{attrs.op_attrs.get()}, + /*op_attrs=*/TrainingOperationAttrs{attrs.op_attrs}, /*pcg_layer_guid=*/dynamic_layer_guid_t{layer}, /*per_device_op_state=*/std::nullopt, }; diff --git a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc index 
aed5f2c4c3..faa1e186c3 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc @@ -2,9 +2,9 @@ #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" #include "task-spec/dynamic_graph/dynamic_tensor_role.h" #include "utils/containers/are_all_same.h" +#include "utils/containers/get_only.h" #include "utils/containers/merge_disjoint_maps.h" #include "utils/containers/transform.h" -#include "utils/containers/get_only.h" namespace FlexFlow { @@ -30,6 +30,11 @@ bool graph_is_fully_pass_expanded(DynamicOpenDataflowGraph const &g) { g, node_is_pass_expanded, value_is_pass_expanded, slot_is_pass_expanded); } +static bool is_replicate_attrs(DynamicNodeAttrs const &n) { + return n.op_attrs.has_value() && n.op_attrs.value().has() && + n.op_attrs.value().get().has(); +} + DynamicTensorSlot pass_expand_slot(DynamicTensorSlot const &s, FwbTensorType tensor_type) { ASSERT(!slot_is_pass_expanded(s)); @@ -156,8 +161,7 @@ DynamicOpenDataflowGraph DynamicOpenDataflowGraph result = flatmap_dynamic_invocation_set( g, [](DynamicNodeInvocation const &invocation) { - if (invocation.node_attrs.op_attrs.has_value() && - invocation.node_attrs.op_attrs.value().is_replicate()) { + if (is_replicate_attrs(invocation.node_attrs)) { return perform_pass_expansion_for_replicate(invocation); } if (invocation.inputs.empty()) { diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc index f30a4d8470..d3365ae44c 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc @@ -191,7 +191,6 @@ static std::unordered_set return result; } - static std::unordered_set perform_shard_expansion_for_copy(DynamicNodeInvocation const &i) { auto [input_slot, input] = get_only(i.inputs); @@ -228,18 +227,21 @@ std::unordered_set return 
perform_shard_expansion_for_copy(i); } + bool const is_replicate = + i.node_attrs.op_attrs.has_value() && + i.node_attrs.op_attrs.value().has() && + i.node_attrs.op_attrs.value() + .get() + .has(); + // forward replicate - if (i.node_attrs.op_attrs.has_value() && - i.node_attrs.op_attrs.value().is_replicate() && - i.node_attrs.task_type.has_value() && + if (is_replicate && i.node_attrs.task_type.has_value() && i.node_attrs.task_type.value() == DynamicTaskType::FWD) { return perform_shard_expansion_for_replicate(i); } // backward replicate - if (i.node_attrs.op_attrs.has_value() && - i.node_attrs.op_attrs.value().is_replicate() && - i.node_attrs.task_type.has_value() && + if (is_replicate && i.node_attrs.task_type.has_value() && i.node_attrs.task_type.value() == DynamicTaskType::BWD) { return perform_shard_expansion_for_replicate_bwd(i); } From 6cd706091420f4e9c776d75dc3464bbf040f5385 Mon Sep 17 00:00:00 2001 From: Seema Mirchandaney Date: Wed, 15 Apr 2026 16:15:21 -0700 Subject: [PATCH 4/6] Add comments to realm reductions, Use existing graph methods --- .../realm-execution/tasks/realm_reduction.h | 69 +++++++++++++++---- ...mic_open_dataflow_graph_from_mapped_pcg.cc | 44 ++++++------ 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h index d9cf00441b..512e344824 100644 --- a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h +++ b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h @@ -1,23 +1,33 @@ -#pragma once +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_REDUCTION_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_REDUCTION_H #include "op-attrs/datatype.dtg.h" #include namespace FlexFlow { -// Sum reduction for float +/** + * \brief Realm Sum Reduction for Float + * \see 
https://legion.stanford.edu/tutorial/realm/reductions.html + */ struct SumReductionFloat { using LHS = float; using RHS = float; - static constexpr RHS identity = 0.0f; // ← inside struct, constexpr + /** \brief Identity element for addition (0.0) */ + static constexpr RHS identity = 0.0f; + + /** + * \brief Apply reduction: lhs += rhs + * \tparam EXCLUSIVE If true, direct addition; if false, atomic CAS loop + * \param lhs Left-hand side accumulator (modified in place) + * \param rhs Value to add + */ template static void apply(LHS &lhs, RHS rhs) { if (EXCLUSIVE) { lhs += rhs; } else { - // atomic add for non-exclusive - __sync_fetch_and_add((int *)&lhs, *(int *)&rhs); - // proper float atomic add — use union trick + // Atomic float add via CAS loop union { float f; int i; @@ -30,11 +40,18 @@ struct SumReductionFloat { } } + /** + * \brief Fold two RHS values: rhs1 += rhs2 + * \tparam EXCLUSIVE If true, direct addition; if false, atomic CAS loop + * \param rhs1 Accumulator (modified in place) + * \param rhs2 Value to fold in + */ template static void fold(RHS &rhs1, RHS rhs2) { if (EXCLUSIVE) { rhs1 += rhs2; } else { + // Atomic float add via CAS loop union { float f; int i; @@ -48,17 +65,29 @@ struct SumReductionFloat { } }; -// Sum reduction for double +/** + * \brief Realm Sum Reduction for Double + * \see https://legion.stanford.edu/tutorial/realm/reductions.html + */ struct SumReductionDouble { using LHS = double; using RHS = double; - static constexpr RHS identity = 0.0; // ← inside struct, constexpr + /** \brief Identity element for addition (0.0) */ + static constexpr RHS identity = 0.0; + + /** + * \brief Apply reduction: lhs += rhs + * \tparam EXCLUSIVE If true, direct addition; if false, atomic CAS loop + * \param lhs Left-hand side accumulator (modified in place) + * \param rhs Value to add + */ template static void apply(LHS &lhs, RHS rhs) { if (EXCLUSIVE) { lhs += rhs; } else { + // Atomic double add via CAS loop using long long reinterpretation union 
{ double d; long long i; @@ -71,11 +100,18 @@ struct SumReductionDouble { } } + /** + * \brief Fold two RHS values: rhs1 += rhs2 + * \tparam EXCLUSIVE If true, direct addition; if false, atomic CAS loop + * \param rhs1 Accumulator (modified in place) + * \param rhs2 Value to fold in + */ template static void fold(RHS &rhs1, RHS rhs2) { if (EXCLUSIVE) { rhs1 += rhs2; } else { + // Atomic double add via CAS loop using long long reinterpretation union { double d; long long i; @@ -89,12 +125,21 @@ struct SumReductionDouble { } }; -// Reduction op IDs — must not conflict with other registered redops +/** + * \brief Reduction op IDs for sum reductions + * \warning These IDs must not conflict with other registered reduction ops + */ enum SumReductionOpIDs { - REDOP_SUM_FLOAT = 1, - REDOP_SUM_DOUBLE = 2, + REDOP_SUM_FLOAT = 1, ///< Sum reduction op ID for float + REDOP_SUM_DOUBLE = 2, ///< Sum reduction op ID for double }; +/** + * \brief Returns the Realm reduction op ID for a sum reduction over the given datatype + * \param dtype The datatype to look up + * \return The corresponding Realm::ReductionOpID + * \throws PANIC if no sum reduction is registered for the given datatype + */ inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { switch (dtype) { case DataType::FLOAT: @@ -105,5 +150,5 @@ inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { PANIC("no sum reduction registered for datatype {}", dtype); } } - } // namespace FlexFlow +#endif diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc index a4ef156db9..9349341d4b 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mapped_pcg.cc @@ -2,6 +2,7 @@ #include "op-attrs/parallel_tensor_shape.h" 
#include "op-attrs/pcg_operator_attrs.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_attrs.dtg.h" #include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" @@ -18,31 +19,30 @@ static bidict MappedParallelComputationGraph const &mpcg, parallel_layer_guid_t const &replicate_layer) { - auto [input_slot_name, input_tensor_guid] = - get_only(get_incoming_tensors(mpcg.pcg, replicate_layer)); - - // find the layer that produces this tensor - for (auto const &[layer, _] : get_parallel_layer_attrs_mapping(mpcg.pcg)) { - for (auto const &[slot_name, t] : get_outgoing_tensors(mpcg.pcg, layer)) { - if (t == input_tensor_guid) { - MappedOperatorTaskGroup producer_mapping = mpcg.mapped_tasks.at(layer); - return get_tensor_bindings_for_slot_name(producer_mapping, slot_name); - } - } - } + // get_incoming_edges returns map + // replicate has exactly one input + auto [input_slot_name, input_edge] = + get_only(get_incoming_edges(mpcg.pcg, replicate_layer)); - PANIC("could not find producer of replicate layer input tensor"); + parallel_layer_guid_t producer_layer = get_src_layer(input_edge); + TensorSlotName producer_slot = get_src_layer_output_slot_name(input_edge); + + return get_tensor_bindings_for_slot_name(mpcg.mapped_tasks.at(producer_layer), + producer_slot); } static std::unordered_map get_consumers_of_tensor(MappedParallelComputationGraph const &mpcg, parallel_tensor_guid_t const &tensor) { + parallel_layer_guid_t producer_layer = get_source_layer(mpcg.pcg, tensor); + std::unordered_map result; - for (auto const &[layer, _] : get_parallel_layer_attrs_mapping(mpcg.pcg)) { - for (auto const &[slot_name, t] : get_incoming_tensors(mpcg.pcg, layer)) { - if (t == tensor) { - result.insert({layer, slot_name}); - } + // get_outgoing_edges returns 
the set of edges leaving the producer layer
have no ComputationGraphOpAttrs equivalent - Instance allocation uses offset index spaces for sharded tensors - issue_copy uses actual instance index space via get_indexspace() - Add CopyDomain::SRC/DST to select correct copy domain - Combine FWD and Reduction FWD register only first invocation in ManyToOne - Add get_per_device_shape() for correct per-device tensor size - Add perform_shard_expansion_one_to_many and _many_to_one generic functions - Add parallel_op_utils.h shared header for is_parallel_op_attrs - Add CopyDomain enum and create_instance_with_offset to RealmContext - Add multi-cpu tests for the parallel operators --- .../include/op-attrs/parallel_tensor_dims.h | 2 +- .../include/op-attrs/parallel_tensor_shape.h | 2 + .../src/op-attrs/parallel_tensor_dims.cc | 8 + .../src/op-attrs/parallel_tensor_shape.cc | 7 + .../test/src/op-attrs/ops/element_unary.cc | 1 - .../include/realm-execution/realm_context.h | 54 ++- .../realm-execution/instance_allocation.cc | 49 ++- .../src/realm-execution/pcg_instance.cc | 166 +++++++- .../src/realm-execution/realm_context.cc | 128 +++++- .../src/realm-execution/tasks/task_id_t.cc | 30 +- .../src/realm-execution/test_op_combine.cc | 198 ++++++++++ .../src/realm-execution/test_op_reduce.cc | 288 ++++++++++++++ .../realm-execution/test_op_repartition.cc | 167 ++++++++ .../dynamic_graph/parallel_op_utils.h | 28 ++ .../task-spec/dynamic_graph/copy_insertion.cc | 21 +- .../dynamic_open_dataflow_graph.cc | 8 + ...mic_open_dataflow_graph_from_mapped_pcg.cc | 36 +- .../task-spec/dynamic_graph/pass_expansion.cc | 53 +-- .../dynamic_graph/shard_expansion.cc | 367 +++++++++++++++--- 19 files changed, 1447 insertions(+), 166 deletions(-) create mode 100644 lib/realm-execution/test/src/realm-execution/test_op_combine.cc create mode 100644 lib/realm-execution/test/src/realm-execution/test_op_reduce.cc create mode 100644 lib/realm-execution/test/src/realm-execution/test_op_repartition.cc create mode 100644 
lib/task-spec/include/task-spec/dynamic_graph/parallel_op_utils.h diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 9e71785013..52a2371ed0 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -41,7 +41,7 @@ TensorDims get_piece_dims(ParallelTensorDims const &); TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &); TensorDims get_reduced_dims(ParallelTensorDims const &); - +TensorDims get_per_device_dims(ParallelTensorDims const &dims); } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index e23ae33cbf..93be4b230e 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -63,6 +63,8 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape); +TensorShape get_per_device_shape(ParallelTensorShape const &s); + } // namespace FlexFlow #endif diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 71419e4a57..7798db0643 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -127,4 +127,12 @@ TensorDims get_reduced_dims(ParallelTensorDims const &dims) { return TensorDims{dim_sizes}; } +TensorDims get_per_device_dims(ParallelTensorDims const &dims) { + FFOrdered dim_sizes = + transform(dims.shard_dims, [](ShardParallelDim const &d) { + return positive_int{d.size.int_from_positive_int() / + d.degree.int_from_positive_int()}; + }); + return TensorDims{dim_sizes}; +} } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index 
91d3d0b1aa..f4480e3239 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -150,4 +150,11 @@ std::unordered_set return indices; } +// actual per-device allocation size +TensorShape get_per_device_shape(ParallelTensorShape const &s) { + return TensorShape{ + get_per_device_dims(s.dims), + s.data_type, + }; +} } // namespace FlexFlow diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index 43b4be06d8..00df1fc0b9 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -61,6 +61,5 @@ TEST_SUITE(FF_TEST_SUITE) { make_input( SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p))); } - } } diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h index eab42d0d79..6bb38a0824 100644 --- a/lib/realm-execution/include/realm-execution/realm_context.h +++ b/lib/realm-execution/include/realm-execution/realm_context.h @@ -1,3 +1,4 @@ + #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H #define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H @@ -15,6 +16,11 @@ namespace FlexFlow { +enum class CopyDomain { + SRC, // use src instance index space as copy domain (default) + DST, // use dst instance index space as copy domain +}; + /** * @brief An interface that wraps the rest of Realm and protects against certain * classes of bugs, such as shutdown bugs. 
@@ -74,9 +80,9 @@ struct RealmContext { Realm::Event wait_on = Realm::Event::NO_EVENT, int priority = 0, std::optional redop_id = std::nullopt, - bool exclusive = false); + bool exclusive = false, + CopyDomain domain = CopyDomain::SRC); ///\} - /** \name Instance management */ ///\{ std::pair @@ -91,6 +97,50 @@ struct RealmContext { */ Realm::Event get_outstanding_events(); + /** + * \brief Create a Realm region instance with an offset index space. + * + * Similar to \ref create_instance, but allocates the instance with a + * non-zero origin rect. This is used for sharded tensors where each + * shard occupies a sub-region of the full logical tensor's index space. + * + * For example, given a tensor of shape [10, 16] split along dim 0 + * with degree 2: + * - Shard 0 is allocated with rect [0..4, 0..15] + * - Shard 1 is allocated with rect [5..9, 0..15] + * + * This allows plain Realm copies between shards and the combined tensor + * to work correctly — points in each shard's index space match the + * corresponding points in the combined tensor's index space, so Realm + * copies data to the correct region without needing affine indirection. + * + * \param memory The Realm memory in which to allocate the instance. + * \param shape The per-device tensor shape (already divided by degree). + * Determines the size of the instance. + * \param offsets Per-dimension offsets into the full logical tensor. + * \p offsets[i] is the starting index along dimension i. + * For shard k along dim d with piece_size p: + * \p offsets[d] = k * p. + * \param prs Realm profiling request set. + * \param wait_on Event to wait on before creating the instance. + * \return A pair of the created \ref Realm::RegionInstance and a + * \ref Realm::Event that fires when the instance is ready. + * + * \note The instance's index space has origin at \p offsets, not at + * zero. 
Copies to/from this instance must use its actual index + * space (via \c get_indexspace()) rather than a reconstructed + * zero-based index space. + * + * \see create_instance + * \see perform_instance_allocation_for_value + */ + std::pair create_instance_with_offset( + Realm::Memory memory, + TensorShape const &shape, + std::vector const &offsets, + Realm::ProfilingRequestSet const &prs, + Realm::Event wait_on = Realm::Event::NO_EVENT); + protected: /** * \brief Compact **and clear** the outstanding event queue diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc index 4ef2919b10..740e044579 100644 --- a/lib/realm-execution/src/realm-execution/instance_allocation.cc +++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc @@ -1,6 +1,9 @@ #include "realm-execution/instance_allocation.h" #include "local-execution/tensor_allocation.h" +#include "op-attrs/num_ptensor_shard_dims_t.dtg.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/relative_ff_dim_t.h" +#include "op-attrs/shard_parallel_dim.dtg.h" #include "op-attrs/tensor_shape.dtg.h" #include "realm-execution/realm_context.h" #include "realm-execution/tensor_instance_backing.h" @@ -17,10 +20,10 @@ #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include "utils/optional.h" namespace FlexFlow { - std::pair perform_instance_allocation_for_value( MachineSpaceCoordinate const &device_coord, @@ -28,11 +31,51 @@ std::pair RealmContext &ctx) { ASSERT(value.accessor == std::nullopt); - TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value()); + ParallelTensorShape const par_shape = value.parallel_tensor_shape.value(); + + TensorShape shape = get_per_device_shape(par_shape); Realm::Processor proc = ctx.map_device_coord_to_processor(device_coord); Realm::Memory memory = 
ctx.get_nearest_memory(proc); - return ctx.create_instance(memory, shape, Realm::ProfilingRequestSet()); + + int ndims = static_cast(num_shard_dims(par_shape).value); + std::vector offsets(ndims, 0); + + if (value.shard_coord.has_value()) { + ParallelTensorSpaceCoordinate const &coord = value.shard_coord.value(); + + for (int i = 0; i < ndims; i++) { + relative_ff_dim_t rel_dim{i}; + + // skip if shard_components doesn't have this dim + if (!coord.shard_components.idx_is_valid(rel_dim)) { + continue; + } + + ShardParallelDim shard_dim = par_shape.dims.shard_dims.at(rel_dim); + + // skip if not actually sharded + if (shard_dim.degree == 1_p) { + continue; + } + + nonnegative_int piece_size = + shard_dim.size.nonnegative_int_from_positive_int() / + shard_dim.degree.nonnegative_int_from_positive_int(); + nonnegative_int shard_idx = coord.shard_components.at(rel_dim); + offsets[i] = static_cast(shard_idx * piece_size); + } + } + + bool has_offset = + std::any_of(offsets.begin(), offsets.end(), [](int o) { return o != 0; }); + + if (has_offset) { + return ctx.create_instance_with_offset( + memory, shape, offsets, Realm::ProfilingRequestSet()); + } else { + return ctx.create_instance(memory, shape, Realm::ProfilingRequestSet()); + } } TensorInstanceBacking perform_instance_allocation( diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc index 17c62fe70c..06823ad089 100644 --- a/lib/realm-execution/src/realm-execution/pcg_instance.cc +++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc @@ -1,4 +1,5 @@ #include "realm-execution/pcg_instance.h" +#include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_slot_name.dtg.h" #include "pcg/optimizer_attrs.h" #include "realm-execution/dependency_set.h" @@ -78,6 +79,10 @@ std::optional return this->logit_grad_tensor; } +static bool has_task_type(DynamicNodeAttrs const &n, DynamicTaskType t) { + return n.task_type.has_value() && 
n.task_type.value() == t; +} + PCGInstance create_pcg_instance( RealmContext &ctx, MappedParallelComputationGraph const &mpcg, @@ -216,7 +221,29 @@ static Realm::Event spawn_dynamic_node_invocation( precondition); }; - // issue_replicate_bwd lambda + auto issue_sum_reduction_copy = + [&](DynamicValueAttrs const &input, + DynamicValueAttrs const &output) -> Realm::Event { + Realm::RegionInstance src_inst = + tensor_instance_backing.backing.at(input).first; + Realm::RegionInstance dst_inst = + tensor_instance_backing.backing.at(output).first; + + Realm::ReductionOpID redop_id = get_sum_reduction_op_id( + assert_unwrap(input.parallel_tensor_shape).data_type); + + return ctx.issue_copy(assert_unwrap(input.parallel_tensor_shape), + src_inst, + assert_unwrap(output.parallel_tensor_shape), + dst_inst, + Realm::ProfilingRequestSet{}, + precondition, + /*priority=*/0, + /*redop_id=*/redop_id, + /*exclusive=*/false); + }; + + // replicate backward — find GRADIENT slot, chain reductions sequentially auto issue_replicate_bwd = [&]() { std::optional output_grad_opt; for (auto const &[slot, value] : invocation.inputs) { @@ -226,32 +253,112 @@ static Realm::Event spawn_dynamic_node_invocation( } DynamicValueAttrs output_grad = assert_unwrap(output_grad_opt); DynamicValueAttrs input_grad = get_only(invocation.outputs).second; - Realm::RegionInstance dst_inst = - tensor_instance_backing.backing.at(input_grad).first; - Realm::ReductionOpID redop_id = get_sum_reduction_op_id( - assert_unwrap(output_grad.parallel_tensor_shape).data_type); - - // chain reductions sequentially to avoid write races on dst + // chain sequentially to avoid write races Realm::Event e = precondition; for (auto const &[p, m] : assert_unwrap(output_grad.mapping)) { DynamicValueAttrs replica_key = output_grad; replica_key.mapping = bidict{{p, m}}; replica_key.shard_coord = p; + e = issue_sum_reduction_copy(replica_key, input_grad); + } + return e; + }; + + auto issue_reduction_fwd = [&]() { + DynamicValueAttrs 
const &output = get_only(invocation.outputs).second; + Realm::RegionInstance dst_inst = + tensor_instance_backing.backing.at(output).first; + + Realm::ReductionOpID redop_id = get_sum_reduction_op_id( + assert_unwrap(output.parallel_tensor_shape).data_type); + + // chain reductions sequentially + Realm::Event e = precondition; + for (auto const &[slot, input] : invocation.inputs) { + Realm::RegionInstance src_inst = + tensor_instance_backing.backing.at(input).first; + e = ctx.issue_copy(assert_unwrap(input.parallel_tensor_shape), + src_inst, + assert_unwrap(output.parallel_tensor_shape), + dst_inst, + Realm::ProfilingRequestSet{}, + e, + /*priority=*/0, + /*redop_id=*/redop_id, + /*exclusive=*/false); + } + return e; + }; + auto issue_combine_fwd = [&]() { + DynamicValueAttrs const &output = get_only(invocation.outputs).second; + Realm::RegionInstance dst_inst = + tensor_instance_backing.backing.at(output).first; + + // chain copies sequentially — each input shard copies into the output + Realm::Event e = precondition; + for (auto const &[slot, input] : invocation.inputs) { + Realm::RegionInstance src_inst = + tensor_instance_backing.backing.at(input).first; + e = ctx.issue_copy(assert_unwrap(input.parallel_tensor_shape), + src_inst, + assert_unwrap(output.parallel_tensor_shape), + dst_inst, + Realm::ProfilingRequestSet{}, + e); + } + return e; + }; + + auto issue_parallel_op_bwd_copy = [&]() { + // find single GRADIENT input + std::optional grad_input_opt; + for (auto const &[slot, value] : invocation.inputs) { + if (slot.slot_tensor_role == DynamicTensorRole{FwbTensorType::GRADIENT}) { + grad_input_opt = value; + } + } + + // determine copy domain based on op type + PCGOperatorAttrs pcg = + invocation.node_attrs.op_attrs.value().get(); + CopyDomain domain = CopyDomain::SRC; + // reduction BWD: same size → use SRC domain + if (pcg.has()) { + // repartition BWD: src=small shard, dst=full → use SRC domain + domain = CopyDomain::SRC; + } else if (pcg.has()) { + // 
combine BWD: src=full, dst=small shard → use DST domain + domain = CopyDomain::DST; + } + DynamicValueAttrs grad_input = assert_unwrap(grad_input_opt); + DynamicValueAttrs output = get_only(invocation.outputs).second; + Realm::RegionInstance dst_inst = + tensor_instance_backing.backing.at(output).first; + + // iterate over all source coords in grad mapping + // chain copies sequentially into the same destination + Realm::Event e = precondition; + for (auto const &[p, m] : assert_unwrap(grad_input.mapping)) { + DynamicValueAttrs shard_key = grad_input; + shard_key.mapping = + bidict{{p, m}}; + shard_key.shard_coord = p; Realm::RegionInstance src_inst = - tensor_instance_backing.backing.at(replica_key).first; + tensor_instance_backing.backing.at(shard_key).first; - e = ctx.issue_copy(assert_unwrap(output_grad.parallel_tensor_shape), + e = ctx.issue_copy(assert_unwrap(grad_input.parallel_tensor_shape), src_inst, - assert_unwrap(input_grad.parallel_tensor_shape), + assert_unwrap(output.parallel_tensor_shape), dst_inst, Realm::ProfilingRequestSet{}, e, - 0, - redop_id, - false); + /*priority=*/0, + /*redop_id=*/std::nullopt, + /*exclusive=*/false, + /*domain=*/domain); } return e; }; @@ -271,6 +378,39 @@ static Realm::Event spawn_dynamic_node_invocation( } return issue_copy(); // forward }, + [&](RepartitionAttrs const &) { + if (has_task_type(invocation.node_attrs, DynamicTaskType::BWD)) { + return issue_parallel_op_bwd_copy(); // point-to-point copy after shard expansion + } + // FWD: src=[0..9], dst=[0..4] or [5..9] — use DST domain + DynamicValueAttrs const &input = + get_only(invocation.inputs).second; + DynamicValueAttrs const &output = + get_only(invocation.outputs).second; + return ctx.issue_copy( + assert_unwrap(input.parallel_tensor_shape), + tensor_instance_backing.backing.at(input).first, + assert_unwrap(output.parallel_tensor_shape), + tensor_instance_backing.backing.at(output).first, + Realm::ProfilingRequestSet{}, + precondition, + /*priority=*/0, + 
/*redop_id=*/std::nullopt, + /*exclusive=*/false, + /*domain=*/CopyDomain::DST); // ← use dst index space + }, + [&](CombineAttrs const &) { + if (has_task_type(invocation.node_attrs, DynamicTaskType::BWD)) { + return issue_parallel_op_bwd_copy(); // point-to-point copy after shard expansion + } + return issue_combine_fwd(); // forward + }, + [&](ReductionAttrs const &) { + if (has_task_type(invocation.node_attrs, DynamicTaskType::BWD)) { + return issue_parallel_op_bwd_copy(); // broadcast copy after shard expansion + } + return issue_reduction_fwd(); // forward needs sum reduction + }, [&](auto const &) { return spawn_task(); }, }); }, diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc index a4669bf43e..98ec711310 100644 --- a/lib/realm-execution/src/realm-execution/realm_context.cc +++ b/lib/realm-execution/src/realm-execution/realm_context.cc @@ -15,8 +15,28 @@ #include "utils/nonnegative_int/nonnegative_int.h" #include "utils/one_to_many/one_to_many.h" #include "utils/positive_int/positive_int.h" +#include namespace FlexFlow { +template +static Realm::Rect + rect_from_dims_with_offset(TensorDims const &dims, + std::vector const &offsets) { + std::vector values; + for (positive_int const &v : dims.ff_ordered) { + values.push_back(v.int_from_positive_int()); + } + ASSERT((int)values.size() == N); + ASSERT((int)offsets.size() == N); + + std::vector lo(N), hi(N); + for (int i = 0; i < N; i++) { + lo[i] = offsets[i]; + hi[i] = offsets[i] + values[i] - 1; + } + return Realm::Rect{Realm::Point{lo.data()}, + Realm::Point{hi.data()}}; +} RealmContext::RealmContext(Realm::Processor processor) : processor(processor), @@ -163,7 +183,8 @@ Realm::Event Realm::Event wait_on, int priority, std::optional redop_id, - bool exclusive) { + bool exclusive, + CopyDomain domain) { TensorShape src_piece_shape = get_piece_shape(src_shape); TensorShape dst_piece_shape = get_piece_shape(dst_shape); 
ASSERT(src_piece_shape == dst_piece_shape); // For now, assume they match @@ -190,36 +211,40 @@ Realm::Event dst_field.set_redop(redop_id.value(), /*is_fold=*/false, exclusive); } + // select which instance's index space to use as copy domain + Realm::RegionInstance const domain_inst = + (domain == CopyDomain::DST) ? dst_inst : src_inst; + Realm::Event result; switch (src_piece_shape.dims.ff_ordered.num_dims()) { #if REALM_MAX_DIM >= 1 case 1: - result = ispace_from_dims<1>(src_piece_shape.dims) - .copy({src_field}, {dst_field}, requests, wait_on, priority); + result = domain_inst.get_indexspace<1, int>().copy( + {src_field}, {dst_field}, requests, wait_on, priority); break; #endif #if REALM_MAX_DIM >= 2 case 2: - result = ispace_from_dims<2>(src_piece_shape.dims) - .copy({src_field}, {dst_field}, requests, wait_on, priority); + result = domain_inst.get_indexspace<2, int>().copy( + {src_field}, {dst_field}, requests, wait_on, priority); break; #endif #if REALM_MAX_DIM >= 3 case 3: - result = ispace_from_dims<3>(src_piece_shape.dims) - .copy({src_field}, {dst_field}, requests, wait_on, priority); + result = domain_inst.get_indexspace<3, int>().copy( + {src_field}, {dst_field}, requests, wait_on, priority); break; #endif #if REALM_MAX_DIM >= 4 case 4: - result = ispace_from_dims<4>(src_piece_shape.dims) - .copy({src_field}, {dst_field}, requests, wait_on, priority); + result = domain_inst.get_indexspace<4, int>().copy( + {src_field}, {dst_field}, requests, wait_on, priority); break; #endif #if REALM_MAX_DIM >= 5 case 5: - result = ispace_from_dims<5>(src_piece_shape.dims) - .copy({src_field}, {dst_field}, requests, wait_on, priority); + result = domain_inst.get_indexspace<5, int>().copy( + {src_field}, {dst_field}, requests, wait_on, priority); break; #endif default: @@ -230,7 +255,6 @@ Realm::Event this->outstanding_events.push_back(result); return result; } - std::pair RealmContext::create_instance(Realm::Memory memory, TensorShape const &shape, @@ -310,6 +334,86 
@@ std::pair return std::pair{inst, ready}; } +std::pair + RealmContext::create_instance_with_offset( + Realm::Memory memory, + TensorShape const &shape, + std::vector const &offsets, + Realm::ProfilingRequestSet const &prs, + Realm::Event wait_on) { + std::vector field_sizes{static_cast( + size_of_datatype(shape.data_type).int_from_positive_int())}; + Realm::RegionInstance inst; + Realm::Event ready; + switch (shape.dims.ff_ordered.num_dims()) { +#if REALM_MAX_DIM >= 1 + case 1: + ready = Realm::RegionInstance::create_instance( + inst, + memory, + rect_from_dims_with_offset<1>(shape.dims, offsets), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 2 + case 2: + ready = Realm::RegionInstance::create_instance( + inst, + memory, + rect_from_dims_with_offset<2>(shape.dims, offsets), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 3 + case 3: + ready = Realm::RegionInstance::create_instance( + inst, + memory, + rect_from_dims_with_offset<3>(shape.dims, offsets), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 4 + case 4: + ready = Realm::RegionInstance::create_instance( + inst, + memory, + rect_from_dims_with_offset<4>(shape.dims, offsets), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 5 + case 5: + ready = Realm::RegionInstance::create_instance( + inst, + memory, + rect_from_dims_with_offset<5>(shape.dims, offsets), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif + default: + PANIC("TensorShape dims greater than REALM_MAX_DIM: {}", + shape.dims.ff_ordered.num_dims()); + } + this->outstanding_events.push_back(ready); + return {inst, ready}; +} + Realm::Event RealmContext::get_outstanding_events() { Realm::Event result = this->merge_outstanding_events(); this->outstanding_events.push_back(result); diff --git a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc 
b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc index 0bdc2ca6b5..e55eebaabd 100644 --- a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc +++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc @@ -36,7 +36,7 @@ std::optional [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_INIT_TASK_ID; }, [](BroadcastAttrs const &) { return std::nullopt; }, [](CastAttrs const &) { return std::nullopt; }, - [](CombineAttrs const &attrs) { return task_id_t::COMBINE_INIT_TASK_ID; }, + [](CombineAttrs const &attrs) { return std::nullopt; }, [](ConcatAttrs const &) { return std::nullopt; }, [](Conv2DAttrs const &) { return task_id_t::CONV2D_INIT_TASK_ID; }, [](DropoutAttrs const &) { return task_id_t::DROPOUT_INIT_TASK_ID; }, @@ -58,12 +58,8 @@ std::optional [](NoopAttrs const &) { return std::nullopt; }, [](Pool2DAttrs const &) { return task_id_t::POOL2D_INIT_TASK_ID; }, [](ReduceAttrs const &) { return task_id_t::REDUCE_INIT_TASK_ID; }, - [](ReductionAttrs const &attrs) { - return task_id_t::REDUCTION_INIT_TASK_ID; - }, - [](RepartitionAttrs const &attrs) { - return task_id_t::REPARTITION_INIT_TASK_ID; - }, + [](ReductionAttrs const &attrs) { return std::nullopt; }, + [](RepartitionAttrs const &attrs) { return std::nullopt; }, [](ReplicateAttrs const &attrs) { return std::nullopt; }, [](ReshapeAttrs const &) { return std::nullopt; }, [](ReverseAttrs const &) { return std::nullopt; }, @@ -85,7 +81,7 @@ std::optional [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_FWD_TASK_ID; }, [](BroadcastAttrs const &) { return task_id_t::BROADCAST_FWD_TASK_ID; }, [](CastAttrs const &) { return task_id_t::CAST_FWD_TASK_ID; }, - [](CombineAttrs const &attrs) { return task_id_t::COMBINE_FWD_TASK_ID; }, + [](CombineAttrs const &attrs) { return std::nullopt; }, [](ConcatAttrs const &) { return task_id_t::CONCAT_FWD_TASK_ID; }, [](Conv2DAttrs const &) { return task_id_t::CONV2D_FWD_TASK_ID; }, [](DropoutAttrs const &) { return 
task_id_t::DROPOUT_FWD_TASK_ID; }, @@ -107,12 +103,8 @@ std::optional [](NoopAttrs const &) { return std::nullopt; }, [](Pool2DAttrs const &) { return task_id_t::POOL2D_FWD_TASK_ID; }, [](ReduceAttrs const &) { return task_id_t::REDUCE_FWD_TASK_ID; }, - [](ReductionAttrs const &attrs) { - return task_id_t::REDUCTION_FWD_TASK_ID; - }, - [](RepartitionAttrs const &attrs) { - return task_id_t::REPARTITION_FWD_TASK_ID; - }, + [](ReductionAttrs const &attrs) { return std::nullopt; }, + [](RepartitionAttrs const &attrs) { return std::nullopt; }, [](ReplicateAttrs const &attrs) { return std::nullopt; }, [](ReshapeAttrs const &) { return task_id_t::RESHAPE_FWD_TASK_ID; }, [](ReverseAttrs const &) { return task_id_t::REVERSE_FWD_TASK_ID; }, @@ -134,7 +126,7 @@ std::optional [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_BWD_TASK_ID; }, [](BroadcastAttrs const &) { return task_id_t::BROADCAST_BWD_TASK_ID; }, [](CastAttrs const &) { return task_id_t::CAST_BWD_TASK_ID; }, - [](CombineAttrs const &attrs) { return task_id_t::COMBINE_BWD_TASK_ID; }, + [](CombineAttrs const &attrs) { return std::nullopt; }, [](ConcatAttrs const &) { return task_id_t::CONCAT_BWD_TASK_ID; }, [](Conv2DAttrs const &) { return task_id_t::CONV2D_BWD_TASK_ID; }, [](DropoutAttrs const &) { return task_id_t::DROPOUT_BWD_TASK_ID; }, @@ -156,12 +148,8 @@ std::optional [](NoopAttrs const &) { return std::nullopt; }, [](Pool2DAttrs const &) { return task_id_t::POOL2D_BWD_TASK_ID; }, [](ReduceAttrs const &) { return task_id_t::REDUCE_BWD_TASK_ID; }, - [](ReductionAttrs const &attrs) { - return task_id_t::REDUCTION_BWD_TASK_ID; - }, - [](RepartitionAttrs const &attrs) { - return task_id_t::REPARTITION_BWD_TASK_ID; - }, + [](ReductionAttrs const &attrs) { return std::nullopt; }, + [](RepartitionAttrs const &attrs) { return std::nullopt; }, [](ReplicateAttrs const &attrs) { return std::nullopt; }, [](ReshapeAttrs const &) { return task_id_t::RESHAPE_BWD_TASK_ID; }, [](ReverseAttrs const &) { return 
task_id_t::REVERSE_BWD_TASK_ID; }, diff --git a/lib/realm-execution/test/src/realm-execution/test_op_combine.cc b/lib/realm-execution/test/src/realm-execution/test_op_combine.cc new file mode 100644 index 0000000000..1eb953fe93 --- /dev/null +++ b/lib/realm-execution/test/src/realm-execution/test_op_combine.cc @@ -0,0 +1,198 @@ +#include "internal/realm_test_utils.h" +#include "kernels/allocation.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "op-attrs/operator_task_space_to_operator_task_space_mapping.h" +#include "op-attrs/ops/combine.h" +#include "op-attrs/ops/element_unary.h" +#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/repartition.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include "realm-execution/pcg_instance.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/realm_manager.h" +#include "task-spec/permissions.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/require_only_key.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +template +static ParallelLayerAttrs make_layer_attrs(T const &op_attrs) { + 
return ParallelLayerAttrs{ + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, + }; +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training Combine Op (CPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + ControllerTaskResult result = + manager.start_controller([](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input layer + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // repartition along dim 0 with degree 2 + // needed so combine has a degree=2 sharded tensor to combine + RepartitionAttrs repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{0}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(repartition_attrs), + {{TensorSlotName::INPUT, t_input}}, + /*weights=*/{}); + parallel_tensor_guid_t t_repartitioned = require_only_key( + repartition_operator.outputs, TensorSlotName::OUTPUT); + + // combine along dim 0 with degree 2 + CombineAttrs combine_attrs{ + /*combine_dim=*/ff_dim_t{nonnegative_int{0}}, + /*combine_degree=*/2_p, + }; + ParallelLayerAddedResult combine_operator = + add_parallel_layer(pcg, + make_layer_attrs(combine_attrs), + {{TensorSlotName::INPUT, t_repartitioned}}, + /*weights=*/{}); + parallel_tensor_guid_t t_combined = require_only_key( + combine_operator.outputs, 
TensorSlotName::OUTPUT); + + // relu consumer + ParallelLayerAddedResult relu_operator = + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + {{TensorSlotName::INPUT, t_combined}}, + /*weights=*/{}); + + MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; + MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; + + // input: one shard on cpu0 (not yet repartitioned) + ParallelTensorSpaceCoordinate tensor_coord0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + // after repartition: two shards along dim 0 + ParallelTensorSpaceCoordinate tensor_coord_shard0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate tensor_coord_shard1{ + 0_n, 0_n, FFOrdered{1_n, 0_n}}; + // after combine: one shard on cpu0 + ParallelTensorSpaceCoordinate tensor_coord_combined{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + + MappedParallelComputationGraph mpcg{ + pcg, + { + // input: one shard on cpu0 + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + // repartition: OUTPUT only — no INPUT since all replicas + // read same source coord violating bidict uniqueness + {repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard1}, + }}}, + }}}, + // combine: two inputs → one output on cpu0 + {combine_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard1}, + }}}, + }}}, + // relu: one shard on cpu0 + {relu_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_combined}, + {TensorSlotName::OUTPUT, tensor_coord_combined}, + }}}, 
+ }}}, + }}; + + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = create_distributed_ff_handle( + ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = + create_pcg_instance(ctx, + mpcg, + optimizer_attrs, + std::nullopt, + input_tensors, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + + perform_all_passes_for_pcg_instance(pcg_instance, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + }); + result.wait(); + } +} +} // namespace test diff --git a/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc b/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc new file mode 100644 index 0000000000..9648f68898 --- /dev/null +++ b/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc @@ -0,0 +1,288 @@ +#include "internal/realm_test_utils.h" +#include "kernels/allocation.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "op-attrs/operator_task_space_to_operator_task_space_mapping.h" +#include "op-attrs/ops/element_unary.h" +#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/reduction.h" +#include "op-attrs/ops/repartition.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" 
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include "realm-execution/pcg_instance.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/realm_manager.h" +#include "task-spec/permissions.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/require_only_key.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +template +static ParallelLayerAttrs make_layer_attrs(T const &op_attrs) { + return ParallelLayerAttrs{ + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, + }; +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training Reduction Op (CPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + ControllerTaskResult result = manager.start_controller([](RealmContext + &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 4_p; + positive_int in_channels = 8_p; + positive_int out_channels = 4_p; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, in_channels}}, DataType::FLOAT}; + + TensorShape weight_tensor_shape = TensorShape{ + TensorDims{FFOrdered{out_channels, in_channels}}, DataType::FLOAT}; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input layer + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // weight layer + ParallelLayerAddedResult weights_layer = + pcg_add_input_layer(pcg, 
weight_tensor_shape); + parallel_tensor_guid_t t_weight = + require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT); + + // repartition input along feature dim (dim 1) with degree 2 + RepartitionAttrs input_repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult input_repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(input_repartition_attrs), + {{TensorSlotName::INPUT, t_input}}, + /*weights=*/{}); + parallel_tensor_guid_t t_input_repartitioned = require_only_key( + input_repartition_operator.outputs, TensorSlotName::OUTPUT); + + // repartition weight along feature dim (dim 1) with degree 2 + // to match the repartitioned input + RepartitionAttrs weight_repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult weight_repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(weight_repartition_attrs), + {{TensorSlotName::INPUT, t_weight}}, + /*weights=*/{}); + parallel_tensor_guid_t t_weight_repartitioned = require_only_key( + weight_repartition_operator.outputs, TensorSlotName::OUTPUT); + + // linear with repartitioned input and weight + // shard_dim[-1]=2 → sum_degree=2 output + ParallelLayerAddedResult linear_operator = add_parallel_layer( + pcg, + ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{out_channels, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + /*inputs=*/ + { + {TensorSlotName::INPUT, t_input_repartitioned}, + }, + /*weights=*/ + { + {TensorSlotName::WEIGHT, t_weight_repartitioned}, + }); + parallel_tensor_guid_t t_linear = + require_only_key(linear_operator.outputs, TensorSlotName::OUTPUT); + + // reduction degree=2 — sums partial results + ReductionAttrs reduction_attrs{/*reduction_degree=*/2_p}; + ParallelLayerAddedResult reduction_operator = + add_parallel_layer(pcg, + make_layer_attrs(reduction_attrs), + 
{{TensorSlotName::INPUT, t_linear}}, + /*weights=*/{}); + parallel_tensor_guid_t t_reduced = + require_only_key(reduction_operator.outputs, TensorSlotName::OUTPUT); + + // relu consumer + ParallelLayerAddedResult relu_operator = + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + {{TensorSlotName::INPUT, t_reduced}}, + /*weights=*/{}); + + MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; + MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; + + // input: unsharded on cpu0 — 2 shard dims + ParallelTensorSpaceCoordinate input_coord{0_n, 0_n, FFOrdered{0_n, 0_n}}; + + // weight: unsharded on cpu0 — 2 shard dims + ParallelTensorSpaceCoordinate weight_coord{0_n, 0_n, FFOrdered{0_n, 0_n}}; + + // after repartition: input sharded along feature dim + ParallelTensorSpaceCoordinate input_repartitioned_coord_0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate input_repartitioned_coord_1{ + 0_n, 0_n, FFOrdered{0_n, 1_n}}; + + // after repartition: weight sharded along feature dim + ParallelTensorSpaceCoordinate weight_repartitioned_coord_0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate weight_repartitioned_coord_1{ + 0_n, 0_n, FFOrdered{0_n, 1_n}}; + + // linear output: partial sums — sum_component distinguishes them + // output has 2 shard dims [{4,1},{4,1}] + ParallelTensorSpaceCoordinate linear_coord_0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate linear_coord_1{ + 1_n, 0_n, FFOrdered{0_n, 0_n}}; + + // reduced output: fully reduced on cpu0 + ParallelTensorSpaceCoordinate reduced_coord{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + + MappedParallelComputationGraph mpcg{ + pcg, + { + // input: unsharded on cpu0 + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, input_coord}}}}}}}, + // weight: unsharded on cpu0 + {weights_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + 
{{TensorSlotName::OUTPUT, weight_coord}}}}}}}, + // input repartition: OUTPUT only + {input_repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, input_repartitioned_coord_0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, input_repartitioned_coord_1}, + }}}, + }}}, + // weight repartition: OUTPUT only + {weight_repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, weight_repartitioned_coord_0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, weight_repartitioned_coord_1}, + }}}, + }}}, + // linear: INPUT + WEIGHT + OUTPUT per device + {linear_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, input_repartitioned_coord_0}, + {TensorSlotName::WEIGHT, weight_repartitioned_coord_0}, + {TensorSlotName::OUTPUT, linear_coord_0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, input_repartitioned_coord_1}, + {TensorSlotName::WEIGHT, weight_repartitioned_coord_1}, + {TensorSlotName::OUTPUT, linear_coord_1}, + }}}, + }}}, + // reduction: INPUT only — OUTPUT coords not distinct + {reduction_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, linear_coord_0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, linear_coord_1}, + }}}, + }}}, + // relu: on cpu0 only + {relu_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, reduced_coord}, + {TensorSlotName::OUTPUT, reduced_coord}, + }}}, + }}}, + }}; + + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; 
+ + DistributedFfHandle device_handle = + create_distributed_ff_handle(ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = create_pcg_instance(ctx, + mpcg, + optimizer_attrs, + std::nullopt, + input_tensors, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + + perform_all_passes_for_pcg_instance(pcg_instance, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + }); + result.wait(); + } +} +} // namespace test diff --git a/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc b/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc new file mode 100644 index 0000000000..f900fe3843 --- /dev/null +++ b/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc @@ -0,0 +1,167 @@ +#include "internal/realm_test_utils.h" +#include "kernels/allocation.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "op-attrs/operator_task_space_to_operator_task_space_mapping.h" +#include "op-attrs/ops/element_unary.h" +#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include 
"realm-execution/pcg_instance.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/realm_manager.h" +#include "task-spec/permissions.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/require_only_key.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +template +static ParallelLayerAttrs make_layer_attrs(T const &op_attrs) { + return ParallelLayerAttrs{ + /*op_attrs=*/PCGOperatorAttrs{op_attrs}, + /*name=*/std::nullopt, + }; +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE( + "RealmBackend e2e Training Repartition Op (CPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + ControllerTaskResult result = + manager.start_controller([](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // repartition along batch dimension (dim 0) with degree 2 + RepartitionAttrs repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{0}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(repartition_attrs), + {{TensorSlotName::INPUT, t_input}}, + /*weights=*/{}); + parallel_tensor_guid_t t_repartitioned = require_only_key( + repartition_operator.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult relu_operator = + 
add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + {{TensorSlotName::INPUT, t_repartitioned}}, + /*weights=*/{}); + + MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; + MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; + + // input: one shard on cpu0 (not yet repartitioned) + ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; + // after repartition: two shards along dim 0 + ParallelTensorSpaceCoordinate tensor_coord_shard0{ + 0_n, 0_n, FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord_shard1{ + 0_n, 0_n, FFOrdered{1_n}}; + + MappedParallelComputationGraph mpcg{ + pcg, + { + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + // repartition: OUTPUT only (no INPUT in binding) + {repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard1}, + }}}, + }}}, + {relu_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard0}, + {TensorSlotName::OUTPUT, tensor_coord_shard0}, + }}}, + {cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard1}, + {TensorSlotName::OUTPUT, tensor_coord_shard1}, + }}}, + }}}, + }}; + + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = create_distributed_ff_handle( + ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = + create_pcg_instance(ctx, + mpcg, + optimizer_attrs, + std::nullopt, + input_tensors, + ProfilingSettings{0, 0}, + device_handle, + 
FFIterationConfig{1_p}); + + perform_all_passes_for_pcg_instance(pcg_instance, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + }); + result.wait(); + } +} +} // namespace test diff --git a/lib/task-spec/include/task-spec/dynamic_graph/parallel_op_utils.h b/lib/task-spec/include/task-spec/dynamic_graph/parallel_op_utils.h new file mode 100644 index 0000000000..095c9edc41 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/parallel_op_utils.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_PARALLEL_OP_UTILS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_PARALLEL_OP_UTILS_H + +#include "op-attrs/ops/combine.h" +#include "op-attrs/ops/reduction.h" +#include "op-attrs/ops/repartition.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h" +#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h" + +namespace FlexFlow { + +inline bool is_parallel_op_attrs(DynamicNodeAttrs const &n) { + if (!n.op_attrs.has_value()) { + return false; + } + if (!n.op_attrs.value().has()) { + return false; + } + PCGOperatorAttrs pcg = n.op_attrs.value().get(); + return pcg.has() || pcg.has() || + pcg.has() || pcg.has(); +} + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_PARALLEL_OP_UTILS_H diff --git a/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc index ef41042a51..becb068a1d 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/copy_insertion.cc @@ -9,6 +9,7 @@ #include "task-spec/dynamic_graph/dynamic_task_type.h" #include "task-spec/dynamic_graph/dynamic_tensor_slot.dtg.h" #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/parallel_op_utils.h" #include 
"utils/bidict/algorithms/bidict_from_pairs.h" #include "utils/bidict/algorithms/unordered_set_of.h" #include "utils/containers/contains_key.h" @@ -25,14 +26,6 @@ bool node_is_copy(DynamicNodeAttrs const &n) { return n.op_attrs.has_value() && n.op_attrs.value().is_copy(); } -static bool is_replicate_invocation(DynamicNodeInvocation const &i) { - return i.node_attrs.op_attrs.has_value() && - i.node_attrs.op_attrs.value().has() && - i.node_attrs.op_attrs.value() - .get() - .has(); -} - bool value_is_mapped(DynamicValueAttrs const &n) { return n.mapping.has_value(); } @@ -41,8 +34,8 @@ bool no_part_of_graph_is_copy_inserted(DynamicOpenDataflowGraph const &g) { auto slot_is_mapped = [](DynamicTensorSlot const &) -> bool { return false; }; // check all non-replicate invocations for (DynamicNodeInvocation const &i : g.invocations) { - if (is_replicate_invocation(i)) { - continue; // replicate tensors have mapping set by design + if (is_parallel_op_attrs(i.node_attrs)) { + continue; // parallel tensors have mapping set by design } if (node_is_copy(i.node_attrs)) { return false; @@ -110,9 +103,9 @@ std::unordered_set perform_copy_insertion_for_invocation( std::unordered_map const &unmapped_value_to_mapped_source_value) { - // replicate nodes have no MappedOperatorTaskGroup — + // parallel op nodes have no MappedOperatorTaskGroup — // pass through unchanged, no copies needed - if (is_replicate_invocation(i)) { + if (is_parallel_op_attrs(i.node_attrs)) { return {i}; } MappedOperatorTaskGroup mapping = assert_unwrap(i.node_attrs.mapping); @@ -187,9 +180,9 @@ DynamicOpenDataflowGraph std::unordered_map unmapped_value_to_mapped_source_value; for (DynamicNodeInvocation const &i : g.invocations) { - // replicate nodes have no MappedOperatorTaskGroup — + // parallel op nodes have no MappedOperatorTaskGroup — // output mapping already fully set, maps to itself - if (is_replicate_invocation(i)) { + if (is_parallel_op_attrs(i.node_attrs)) { for (auto const &[slot, value] : i.outputs) 
{ unmapped_value_to_mapped_source_value.insert(std::pair{value, value}); } diff --git a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc index bf9fe1d3a0..3a668feba1 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc @@ -1,4 +1,5 @@ #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/parallel_op_utils.h" #include "utils/containers/all_of.h" #include "utils/containers/contains_duplicates.h" #include "utils/containers/flatmap.h" @@ -149,6 +150,13 @@ std::pair namespace FlexFlow { + static bidict - get_input_mapping_for_replicate( + get_input_mapping_for_parallel_op( MappedParallelComputationGraph const &mpcg, - parallel_layer_guid_t const &replicate_layer) { + parallel_layer_guid_t const &layer) { // get_incoming_edges returns map // replicate has exactly one input auto [input_slot_name, input_edge] = - get_only(get_incoming_edges(mpcg.pcg, replicate_layer)); + get_only(get_incoming_edges(mpcg.pcg, layer)); parallel_layer_guid_t producer_layer = get_src_layer(input_edge); TensorSlotName producer_slot = get_src_layer_output_slot_name(input_edge); @@ -49,12 +50,12 @@ static std::unordered_map } static bidict - build_replicated_output_mapping( + build_output_mapping_for_parallel_op( MappedParallelComputationGraph const &mpcg, - parallel_layer_guid_t const &replicate_layer) { + parallel_layer_guid_t const &layer) { auto [output_slot_name, output_tensor_guid] = - get_only(get_outgoing_tensors(mpcg.pcg, replicate_layer)); + get_only(get_outgoing_tensors(mpcg.pcg, layer)); auto consumers = get_consumers_of_tensor(mpcg, output_tensor_guid); ASSERT(!consumers.empty()); @@ -75,9 +76,9 @@ static bidict } static DynamicNodeInvocation - build_replicate_invocation(parallel_layer_guid_t const &layer, - ReplicateAttrs const 
&attrs, - MappedParallelComputationGraph const &mpcg) { + build_parallel_op_invocation(parallel_layer_guid_t const &layer, + ParallelLayerAttrs const &attrs, + MappedParallelComputationGraph const &mpcg) { auto [input_slot_name, input_tensor_guid] = get_only(get_incoming_tensors(mpcg.pcg, layer)); auto incoming = get_incoming_tensors(mpcg.pcg, layer); @@ -87,14 +88,12 @@ static DynamicNodeInvocation ParallelTensorAttrs input_attrs = get_parallel_tensor_attrs(mpcg.pcg, input_tensor_guid); - bidict input_mapping = - get_input_mapping_for_replicate(mpcg, layer); DynamicValueAttrs input_value{ /*tensor_guid=*/dynamic_tensor_guid_t{input_tensor_guid}, /*parallel_tensor_shape=*/input_attrs.shape, /*shard_coord=*/std::nullopt, - /*mapping=*/get_input_mapping_for_replicate(mpcg, layer), + /*mapping=*/get_input_mapping_for_parallel_op(mpcg, layer), /*accessor=*/std::nullopt, /*role=*/std::nullopt, }; @@ -108,7 +107,7 @@ static DynamicNodeInvocation /*tensor_guid=*/dynamic_tensor_guid_t{output_tensor_guid}, /*parallel_tensor_shape=*/output_attrs.shape, /*shard_coord=*/std::nullopt, - /*mapping=*/build_replicated_output_mapping(mpcg, layer), + /*mapping=*/build_output_mapping_for_parallel_op(mpcg, layer), /*accessor=*/std::nullopt, /*role=*/std::nullopt, }; @@ -116,7 +115,7 @@ static DynamicNodeInvocation /*task_type=*/std::nullopt, /*device_coord=*/std::nullopt, /*mapping=*/std::nullopt, - /*op_attrs=*/TrainingOperationAttrs{PCGOperatorAttrs{attrs}}, + /*op_attrs=*/TrainingOperationAttrs{attrs.op_attrs}, /*pcg_layer_guid=*/dynamic_layer_guid_t{layer}, /*per_device_op_state=*/std::nullopt, }; @@ -138,11 +137,11 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mapped_pcg( for (auto const &[layer, attrs] : get_parallel_layer_attrs_mapping(mpcg.pcg)) { - if (attrs.op_attrs.has()) { + if (is_parallel_op(attrs.op_attrs)) { // build replicate invocation - DynamicNodeInvocation repl_inv = build_replicate_invocation( - layer, attrs.op_attrs.get(), mpcg); - 
result.invocations.emplace(repl_inv); + DynamicNodeInvocation parallel_inv = + build_parallel_op_invocation(layer, attrs, mpcg); + result.invocations.emplace(parallel_inv); continue; } @@ -200,7 +199,6 @@ DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mapped_pcg( result.invocations.emplace(result_inputs, result_attrs, result_outputs); } - return result; } diff --git a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc index faa1e186c3..036579c80a 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc @@ -1,6 +1,7 @@ #include "task-spec/dynamic_graph/pass_expansion.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" #include "task-spec/dynamic_graph/dynamic_tensor_role.h" +#include "task-spec/dynamic_graph/parallel_op_utils.h" #include "utils/containers/are_all_same.h" #include "utils/containers/get_only.h" #include "utils/containers/merge_disjoint_maps.h" @@ -30,11 +31,6 @@ bool graph_is_fully_pass_expanded(DynamicOpenDataflowGraph const &g) { g, node_is_pass_expanded, value_is_pass_expanded, slot_is_pass_expanded); } -static bool is_replicate_attrs(DynamicNodeAttrs const &n) { - return n.op_attrs.has_value() && n.op_attrs.value().has() && - n.op_attrs.value().get().has(); -} - DynamicTensorSlot pass_expand_slot(DynamicTensorSlot const &s, FwbTensorType tensor_type) { ASSERT(!slot_is_pass_expanded(s)); @@ -115,42 +111,49 @@ DynamicNodeInvocation perform_bwd_pass_expansion_for_invocation( transform(invocation.inputs, to_grad), }; } + static std::unordered_set - perform_pass_expansion_for_replicate( + perform_pass_expansion_for_parallel_op( DynamicNodeInvocation const &invocation) { auto const &[input_slot, input] = get_only(invocation.inputs); - auto const &[output_slot, output] = get_only(invocation.outputs); - // forward: INPUT/FWD → OUTPUT/FWD (copy to replicas) + auto to_fwd = 
[](DynamicTensorSlot const &k, DynamicValueAttrs const &v) { + return std::pair{ + pass_expand_slot(k, FwbTensorType::FORWARD), + pass_expand_value(v, FwbTensorType::FORWARD), + }; + }; + + auto to_grad = [](DynamicTensorSlot const &k, DynamicValueAttrs const &v) { + return std::pair{ + pass_expand_slot(k, FwbTensorType::GRADIENT), + pass_expand_value(v, FwbTensorType::GRADIENT), + }; + }; + DynamicNodeInvocation fwd{ /*inputs=*/{{pass_expand_slot(input_slot, FwbTensorType::FORWARD), pass_expand_value(input, FwbTensorType::FORWARD)}}, /*node_attrs=*/ pass_expand_node(invocation.node_attrs, DynamicTaskType::FWD), - /*outputs=*/ - {{pass_expand_slot(output_slot, FwbTensorType::FORWARD), - pass_expand_value(output, FwbTensorType::FORWARD)}}, + /*outputs=*/transform(invocation.outputs, to_fwd), }; - // backward: OUTPUT/FWD + OUTPUT/GRAD → INPUT/GRAD (reduce gradients) - // The backward node needs the mapping from the output (replicated) - // so it knows which replicas to reduce from - DynamicNodeAttrs bwd_node_attrs = invocation.node_attrs; - bwd_node_attrs.task_type = DynamicTaskType::BWD; + DynamicNodeAttrs bwd_node = invocation.node_attrs; + bwd_node.task_type = DynamicTaskType::BWD; DynamicNodeInvocation bwd{ - /*inputs=*/{ - {pass_expand_slot(output_slot, FwbTensorType::FORWARD), - pass_expand_value(output, FwbTensorType::FORWARD)}, - {pass_expand_slot(output_slot, FwbTensorType::GRADIENT), - pass_expand_value(output, FwbTensorType::GRADIENT)}, - }, - /*node_attrs=*/bwd_node_attrs, + /*inputs=*/merge_disjoint_maps(std::vector{ + transform(invocation.outputs, to_fwd), + transform(invocation.outputs, to_grad), + }), + /*node_attrs=*/bwd_node, /*outputs=*/ {{pass_expand_slot(input_slot, FwbTensorType::GRADIENT), pass_expand_value(input, FwbTensorType::GRADIENT)}}, }; + return {fwd, bwd}; } @@ -161,8 +164,8 @@ DynamicOpenDataflowGraph DynamicOpenDataflowGraph result = flatmap_dynamic_invocation_set( g, [](DynamicNodeInvocation const &invocation) { - if 
(is_replicate_attrs(invocation.node_attrs)) { - return perform_pass_expansion_for_replicate(invocation); + if (is_parallel_op_attrs(invocation.node_attrs)) { + return perform_pass_expansion_for_parallel_op(invocation); } if (invocation.inputs.empty()) { return std::unordered_set{ diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc index d3365ae44c..c049a35cb1 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc @@ -1,6 +1,7 @@ #include "task-spec/dynamic_graph/shard_expansion.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/parallel_op_utils.h" #include "utils/bidict/algorithms/filter_keys.h" #include "utils/containers/get_only.h" #include "utils/containers/map_values2.h" @@ -18,6 +19,10 @@ bool value_is_shard_expanded(DynamicValueAttrs const &n) { return n.shard_coord.has_value(); } +static bool has_task_type(DynamicNodeAttrs const &n, DynamicTaskType t) { + return n.task_type.has_value() && n.task_type.value() == t; +} + bool no_part_of_graph_is_shard_expanded(DynamicOpenDataflowGraph const &g) { auto slot_is_shard_expanded = [](DynamicTensorSlot const &) -> bool { return false; @@ -85,35 +90,134 @@ static DynamicNodeInvocation shard_invocation_for_binding( } static std::unordered_set - perform_shard_expansion_for_replicate(DynamicNodeInvocation const &i) { - auto const &[input_slot, input] = get_only(i.inputs); - auto const &[output_slot, output] = get_only(i.outputs); + perform_shard_expansion_one_to_many( + DynamicNodeInvocation const &i, + std::function output_to_input_coord) { - bidict input_mapping = - assert_unwrap(input.mapping); - bidict output_mapping = - assert_unwrap(output.mapping); - - return transform(output_mapping.left_values(), - [&](ParallelTensorSpaceCoordinate 
const &p) { - ParallelTensorSpaceCoordinate input_p{ - /*sum_component=*/p.sum_component, - /*discard_copy_component=*/nonnegative_int{0}, - /*shard_components=*/p.shard_components, - }; - return shard_invocation_for_binding( - i, - output_mapping.at_l(p), - OperatorAtomicTaskShardBinding{{ - {input_slot.slot_name, input_p}, - {output_slot.slot_name, p}, - }}); - }); -} + if (has_task_type(i.node_attrs, DynamicTaskType::FWD)) { + auto const &[input_slot, input] = get_only(i.inputs); + auto const &[output_slot, output] = get_only(i.outputs); + + bidict + output_mapping = assert_unwrap(output.mapping); + + return transform(output_mapping.left_values(), + [&](ParallelTensorSpaceCoordinate const &p) { + ParallelTensorSpaceCoordinate input_p = + output_to_input_coord(p); + return shard_invocation_for_binding( + i, + output_mapping.at_l(p), + OperatorAtomicTaskShardBinding{{ + {input_slot.slot_name, input_p}, + {output_slot.slot_name, p}, + }}); + }); + } + + // BWD case — inputs are OUTPUT/FWD and OUTPUT/GRAD, output is INPUT/GRAD + std::optional output_grad_opt; + std::optional output_fwd_opt; + std::optional output_grad_slot_opt; + std::optional output_fwd_slot_opt; + + for (auto const &[slot, value] : i.inputs) { + if (slot.slot_tensor_role == DynamicTensorRole{FwbTensorType::GRADIENT}) { + output_grad_slot_opt = slot; + output_grad_opt = value; + } else { + output_fwd_slot_opt = slot; + output_fwd_opt = value; + } + } + DynamicValueAttrs output_grad = assert_unwrap(output_grad_opt); + DynamicValueAttrs output_fwd = assert_unwrap(output_fwd_opt); + DynamicTensorSlot output_grad_slot = assert_unwrap(output_grad_slot_opt); + DynamicTensorSlot output_fwd_slot = assert_unwrap(output_fwd_slot_opt); + auto const &[input_grad_slot, input_grad] = get_only(i.outputs); + + bidict + input_grad_mapping = assert_unwrap(input_grad.mapping); + + // iterate over input_grad coords (the "many" side) + return transform( + input_grad_mapping.left_values(), + 
[&](ParallelTensorSpaceCoordinate const &p) { + // map input_grad coord to output_grad coord + ParallelTensorSpaceCoordinate output_p = output_to_input_coord(p); + MachineSpaceCoordinate dst_machine = input_grad_mapping.at_l(p); + + bidict + output_grad_mapping = assert_unwrap(output_grad.mapping); + + DynamicValueAttrs sharded_output_grad = output_grad; + sharded_output_grad.mapping = + bidict{ + {output_p, output_grad_mapping.at_l(output_p)}}; + sharded_output_grad.shard_coord = output_p; + + DynamicValueAttrs sharded_output_fwd = output_fwd; + sharded_output_fwd.mapping = + bidict{ + {output_p, output_grad_mapping.at_l(output_p)}}; + sharded_output_fwd.shard_coord = output_p; + + DynamicValueAttrs sharded_input_grad = input_grad; + sharded_input_grad.mapping = + bidict{ + {p, dst_machine}}; + sharded_input_grad.shard_coord = p; + + DynamicNodeAttrs sharded_node = i.node_attrs; + sharded_node.device_coord = dst_machine; + + return DynamicNodeInvocation{ + /*inputs=*/{ + {output_fwd_slot, sharded_output_fwd}, + {output_grad_slot, sharded_output_grad}, + }, + /*node_attrs=*/sharded_node, + /*outputs=*/ + { + {input_grad_slot, sharded_input_grad}, + }, + }; + }); +} static std::unordered_set - perform_shard_expansion_for_replicate_bwd(DynamicNodeInvocation const &i) { + perform_shard_expansion_many_to_one( + DynamicNodeInvocation const &i, + std::function input_to_output_coord) { + + if (has_task_type(i.node_attrs, DynamicTaskType::FWD)) { + auto const &[input_slot, input] = get_only(i.inputs); + auto const &[output_slot, output] = get_only(i.outputs); + + bidict + input_mapping = assert_unwrap(input.mapping); + bidict + output_mapping = assert_unwrap(output.mapping); + + return transform(input_mapping.left_values(), + [&](ParallelTensorSpaceCoordinate const &p) { + ParallelTensorSpaceCoordinate output_p = + input_to_output_coord(p); + MachineSpaceCoordinate dst_machine = + output_mapping.at_l(output_p); + return shard_invocation_for_binding( + i, + dst_machine, + 
OperatorAtomicTaskShardBinding{{ + {input_slot.slot_name, p}, + {output_slot.slot_name, output_p}, + }}); + }); + } + // BWD case std::optional output_grad_opt; std::optional output_fwd_opt; std::optional output_grad_slot_opt; @@ -140,41 +244,43 @@ static std::unordered_set bidict input_grad_mapping = assert_unwrap(input_grad.mapping); - std::unordered_map, + // group output_grad coords by their corresponding input_grad coord + std::unordered_map> - by_shard; + input_grad_to_output_grads; for (auto const &p : output_grad_mapping.left_values()) { - by_shard[p.shard_components].insert(p); + input_grad_to_output_grads[input_to_output_coord(p)].insert(p); } std::unordered_set result; - for (auto const &[shard_components, replica_coords] : by_shard) { - ParallelTensorSpaceCoordinate src_p{ - nonnegative_int{0}, nonnegative_int{0}, shard_components}; - MachineSpaceCoordinate src_machine = input_grad_mapping.at_l(src_p); + for (auto const &[input_grad_p, output_grad_coords] : + input_grad_to_output_grads) { + MachineSpaceCoordinate dst_machine = input_grad_mapping.at_l(input_grad_p); + + // subset output_grad mapping to just this group's coords bidict replica_mapping; - for (auto const &p : replica_coords) { + for (auto const &p : output_grad_coords) { replica_mapping.equate(p, output_grad_mapping.at_l(p)); } DynamicValueAttrs sharded_output_grad = output_grad; sharded_output_grad.mapping = replica_mapping; - sharded_output_grad.shard_coord = src_p; + sharded_output_grad.shard_coord = input_grad_p; DynamicValueAttrs sharded_output_fwd = output_fwd; sharded_output_fwd.mapping = replica_mapping; - sharded_output_fwd.shard_coord = src_p; + sharded_output_fwd.shard_coord = input_grad_p; DynamicValueAttrs sharded_input_grad = input_grad; sharded_input_grad.mapping = bidict{ - {src_p, src_machine}}; - sharded_input_grad.shard_coord = src_p; + {input_grad_p, dst_machine}}; + sharded_input_grad.shard_coord = input_grad_p; DynamicNodeAttrs sharded_node = i.node_attrs; - 
sharded_node.device_coord = src_machine; + sharded_node.device_coord = dst_machine; result.insert(DynamicNodeInvocation{ /*inputs=*/{ @@ -191,6 +297,131 @@ static std::unordered_set return result; } +// Replicate/Reduction FWD — output has discard_copy=0..N-1, input always discard_copy=0 +static std::unordered_set + perform_shard_expansion_for_replicate(DynamicNodeInvocation const &i) { + return perform_shard_expansion_one_to_many( + i, [](ParallelTensorSpaceCoordinate const &p) { + return ParallelTensorSpaceCoordinate{ + p.sum_component, nonnegative_int{0}, p.shard_components}; + }); +} + +// Replicate BWD — many discard_copy inputs → one discard_copy=0 output +static std::unordered_set + perform_shard_expansion_for_replicate_bwd(DynamicNodeInvocation const &i) { + return perform_shard_expansion_many_to_one( + i, [](ParallelTensorSpaceCoordinate const &p) { + return ParallelTensorSpaceCoordinate{ + p.sum_component, nonnegative_int{0}, p.shard_components}; + }); +} + +// Repartition FWD — output coord (high) → input coord (low) +static std::unordered_set + perform_shard_expansion_for_repartition(DynamicNodeInvocation const &i) { + RepartitionAttrs attrs = i.node_attrs.op_attrs.value() + .get() + .get(); + relative_ff_dim_t rel_dim = + relative_ff_dim_t_from_ff_dim_t(attrs.repartition_dim); + nonnegative_int degree = + attrs.repartition_degree.nonnegative_int_from_positive_int(); + + return perform_shard_expansion_one_to_many( + i, [=](ParallelTensorSpaceCoordinate const &p) { + FFOrdered input_shard = p.shard_components; + input_shard.at(rel_dim) = + p.shard_components.at(rel_dim) / degree; // ← / not % + return ParallelTensorSpaceCoordinate{ + p.sum_component, p.discard_copy_component, input_shard}; + }); +} + +// Repartition BWD — output_grad coord (high) → input_grad coord (low) +static std::unordered_set + perform_shard_expansion_for_repartition_bwd( + DynamicNodeInvocation const &i) { + RepartitionAttrs attrs = i.node_attrs.op_attrs.value() + .get() + .get(); 
+ relative_ff_dim_t rel_dim = + relative_ff_dim_t_from_ff_dim_t(attrs.repartition_dim); + nonnegative_int degree = + attrs.repartition_degree.nonnegative_int_from_positive_int(); + + return perform_shard_expansion_many_to_one( + i, [=](ParallelTensorSpaceCoordinate const &p) { + FFOrdered input_shard = p.shard_components; + input_shard.at(rel_dim) = + p.shard_components.at(rel_dim) / degree; // ← / not % + return ParallelTensorSpaceCoordinate{ + p.sum_component, p.discard_copy_component, input_shard}; + }); +} + +// Combine FWD — input coord (high) → output coord (low) +static std::unordered_set + perform_shard_expansion_for_combine(DynamicNodeInvocation const &i) { + CombineAttrs attrs = + i.node_attrs.op_attrs.value().get().get(); + relative_ff_dim_t rel_dim = + relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); + nonnegative_int degree = + attrs.combine_degree.nonnegative_int_from_positive_int(); + + return perform_shard_expansion_many_to_one( + i, [=](ParallelTensorSpaceCoordinate const &p) { + FFOrdered output_shard = p.shard_components; + output_shard.at(rel_dim) = + p.shard_components.at(rel_dim) / degree; // ← correct + return ParallelTensorSpaceCoordinate{ + p.sum_component, p.discard_copy_component, output_shard}; + }); +} + +// Combine BWD — input_grad coord (high) → output_grad coord (low) +static std::unordered_set + perform_shard_expansion_for_combine_bwd(DynamicNodeInvocation const &i) { + CombineAttrs attrs = + i.node_attrs.op_attrs.value().get().get(); + relative_ff_dim_t rel_dim = + relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); + nonnegative_int degree = + attrs.combine_degree.nonnegative_int_from_positive_int(); + + return perform_shard_expansion_one_to_many( + i, [=](ParallelTensorSpaceCoordinate const &p) { + FFOrdered output_shard = p.shard_components; + output_shard.at(rel_dim) = + p.shard_components.at(rel_dim) / degree; // ← / not % + return ParallelTensorSpaceCoordinate{ + p.sum_component, p.discard_copy_component, output_shard}; + 
}); +} + +// Reduction FWD — input coord (sum=0..N-1) → output coord (sum=0) +static std::unordered_set + perform_shard_expansion_for_reduction(DynamicNodeInvocation const &i) { + return perform_shard_expansion_many_to_one( + i, [](ParallelTensorSpaceCoordinate const &p) { + return ParallelTensorSpaceCoordinate{ + nonnegative_int{0}, // ← output always has sum=0 + p.discard_copy_component, + p.shard_components}; + }); +} + +// Reduction BWD — output_grad coord (sum=0) → input_grad coord (sum=0..N-1) +static std::unordered_set + perform_shard_expansion_for_reduction_bwd(DynamicNodeInvocation const &i) { + return perform_shard_expansion_many_to_one( + i, [](ParallelTensorSpaceCoordinate const &p) { + return ParallelTensorSpaceCoordinate{ + p.sum_component, nonnegative_int{0}, p.shard_components}; + }); +} + static std::unordered_set perform_shard_expansion_for_copy(DynamicNodeInvocation const &i) { auto [input_slot, input] = get_only(i.inputs); @@ -220,6 +451,47 @@ static std::unordered_set }); } +static std::unordered_set + perform_shard_expansion_for_parallel_op(DynamicNodeInvocation const &i) { + ASSERT(is_parallel_op_attrs(i.node_attrs)); + + PCGOperatorAttrs const pcg = + i.node_attrs.op_attrs.value().get(); + + // forward dispatch + if (has_task_type(i.node_attrs, DynamicTaskType::FWD)) { + if (pcg.has()) { + return perform_shard_expansion_for_replicate(i); + } + if (pcg.has()) { + return perform_shard_expansion_for_repartition(i); + } + if (pcg.has()) { + return perform_shard_expansion_for_combine(i); + } + if (pcg.has()) { + return perform_shard_expansion_for_reduction(i); + } + } + + // backward dispatch + if (has_task_type(i.node_attrs, DynamicTaskType::BWD)) { + if (pcg.has()) { + return perform_shard_expansion_for_replicate_bwd(i); + } + if (pcg.has()) { + return perform_shard_expansion_for_repartition_bwd(i); + } + if (pcg.has()) { + return perform_shard_expansion_for_combine_bwd(i); + } + if (pcg.has()) { + return 
perform_shard_expansion_for_reduction_bwd(i); + } + } + PANIC("unhandled parallel op task_type: {}", i.node_attrs.task_type); +} + std::unordered_set perform_shard_expansion_for_invocation(DynamicNodeInvocation const &i) { if (i.node_attrs.op_attrs.has_value() && @@ -227,23 +499,8 @@ std::unordered_set return perform_shard_expansion_for_copy(i); } - bool const is_replicate = - i.node_attrs.op_attrs.has_value() && - i.node_attrs.op_attrs.value().has() && - i.node_attrs.op_attrs.value() - .get() - .has(); - - // forward replicate - if (is_replicate && i.node_attrs.task_type.has_value() && - i.node_attrs.task_type.value() == DynamicTaskType::FWD) { - return perform_shard_expansion_for_replicate(i); - } - - // backward replicate - if (is_replicate && i.node_attrs.task_type.has_value() && - i.node_attrs.task_type.value() == DynamicTaskType::BWD) { - return perform_shard_expansion_for_replicate_bwd(i); + if (is_parallel_op_attrs(i.node_attrs)) { + return perform_shard_expansion_for_parallel_op(i); } MappedOperatorTaskGroup mapping = assert_unwrap(i.node_attrs.mapping); From 4fe79a46b134a299698cdd8583605752eb35de72 Mon Sep 17 00:00:00 2001 From: Seema Mirchandaney Date: Tue, 28 Apr 2026 15:54:37 -0700 Subject: [PATCH 6/6] Add GPU support for parallel operators in Realm backend Key changes: - Add CUDA reduction registration via Realm::Cuda::add_cuda_redop_kernels - Add apply_cuda/fold_cuda methods to SumReductionFloat and SumReductionDouble - Add REALM_CUDA_HD decorators for host/device compatibility - Add atomicAdd GPU paths with pre-Pascal CAS fallback for double - Move register_reductions() to realm_reduction_cuda.cu compiled with NVCC - Update CMakeLists.txt to use manual add_library with LANGUAGES CXX CUDA - Add GPU test cases for Combine, Repartition and Reduction ops The reduction registration uses create_reduction_op + add_cuda_redop_kernels which registers both CPU and GPU paths in a single call. 
Realm handles per-device kernel translation automatically for all GPUs. --- lib/realm-execution/CMakeLists.txt | 44 +++- .../realm-execution/tasks/realm_reduction.h | 76 +++++- .../tasks/realm_task_registry.h | 6 +- .../tasks/cuda/realm_reduction.cu | 31 +++ .../tasks/realm_task_registry.cc | 8 - .../src/realm-execution/test_op_combine.cc | 154 +++++++++++ .../src/realm-execution/test_op_reduce.cc | 242 ++++++++++++++++++ .../realm-execution/test_op_repartition.cc | 123 +++++++++ 8 files changed, 657 insertions(+), 27 deletions(-) create mode 100644 lib/realm-execution/src/realm-execution/tasks/cuda/realm_reduction.cu diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt index 25a51ada54..67c37b5823 100644 --- a/lib/realm-execution/CMakeLists.txt +++ b/lib/realm-execution/CMakeLists.txt @@ -1,13 +1,32 @@ -ff_add_library( - NAME - realm-execution - SRC_PATTERNS - src/*.cc - PUBLIC_INCLUDE +project(realm-execution + LANGUAGES CXX CUDA) + +file(GLOB_RECURSE SRC + CONFIGURE_DEPENDS + LIST_DIRECTORIES False + src/*.cc + src/**/*.cc + src/cuda/*.cu + src/**/*.cu +) + +add_library( + realm-execution + SHARED + ${SRC} +) + +target_include_directories( + realm-execution + PUBLIC include/ - PRIVATE_INCLUDE + PRIVATE src/ - DEPS +) + +target_link_libraries( + realm-execution + PUBLIC compiler kernels local-execution @@ -19,4 +38,13 @@ ff_add_library( realm ) +define_ff_vars(realm-execution) + +set_target_properties( + realm-execution + PROPERTIES + CUDA_STANDARD 17 +) + add_subdirectory(test) + diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h index 512e344824..388b433947 100644 --- a/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h +++ b/lib/realm-execution/include/realm-execution/tasks/realm_reduction.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_REDUCTION_H #define 
_FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_REDUCTION_H #include "op-attrs/datatype.dtg.h" +#include #include - namespace FlexFlow { /** @@ -23,11 +23,13 @@ struct SumReductionFloat { * \param rhs Value to add */ template - static void apply(LHS &lhs, RHS rhs) { + REALM_CUDA_HD static void apply(LHS &lhs, RHS rhs) { if (EXCLUSIVE) { lhs += rhs; } else { - // Atomic float add via CAS loop +#if defined(__CUDA_ARCH__) + atomicAdd(&lhs, rhs); +#else union { float f; int i; @@ -37,9 +39,15 @@ struct SumReductionFloat { new_val.f = old_val.f + rhs; } while ( !__sync_bool_compare_and_swap((int *)&lhs, old_val.i, new_val.i)); +#endif } } + template + __device__ static void apply_cuda(LHS &lhs, RHS rhs) { + apply(lhs, rhs); + } + /** * \brief Fold two RHS values: rhs1 += rhs2 * \tparam EXCLUSIVE If true, direct addition; if false, atomic CAS loop @@ -47,11 +55,13 @@ struct SumReductionFloat { * \param rhs2 Value to fold in */ template - static void fold(RHS &rhs1, RHS rhs2) { + REALM_CUDA_HD static void fold(RHS &rhs1, RHS rhs2) { if (EXCLUSIVE) { rhs1 += rhs2; } else { - // Atomic float add via CAS loop +#if defined(__CUDA_ARCH__) + atomicAdd(&rhs1, rhs2); +#else union { float f; int i; @@ -61,8 +71,13 @@ struct SumReductionFloat { new_val.f = old_val.f + rhs2; } while ( !__sync_bool_compare_and_swap((int *)&rhs1, old_val.i, new_val.i)); +#endif } } + template + __device__ static void fold_cuda(RHS &rhs1, RHS rhs2) { + fold(rhs1, rhs2); + } }; /** @@ -83,11 +98,24 @@ struct SumReductionDouble { * \param rhs Value to add */ template - static void apply(LHS &lhs, RHS rhs) { + REALM_CUDA_HD static void apply(LHS &lhs, RHS rhs) { if (EXCLUSIVE) { lhs += rhs; } else { - // Atomic double add via CAS loop using long long reinterpretation +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 + atomicAdd(&lhs, rhs); +#elif defined(__CUDA_ARCH__) + // pre-Pascal fallback CAS loop + unsigned long long int *addr = (unsigned long long int *)&lhs; + unsigned long 
long int old = *addr, assumed; + do { + assumed = old; + old = atomicCAS( + addr, + assumed, + __double_as_longlong(rhs + __longlong_as_double(assumed))); + } while (assumed != old); +#else union { double d; long long i; @@ -97,8 +125,13 @@ struct SumReductionDouble { new_val.d = old_val.d + rhs; } while (!__sync_bool_compare_and_swap( (long long *)&lhs, old_val.i, new_val.i)); +#endif } } + template + __device__ static void apply_cuda(LHS &lhs, RHS rhs) { + apply(lhs, rhs); + } /** * \brief Fold two RHS values: rhs1 += rhs2 @@ -107,11 +140,23 @@ struct SumReductionDouble { * \param rhs2 Value to fold in */ template - static void fold(RHS &rhs1, RHS rhs2) { + REALM_CUDA_HD static void fold(RHS &rhs1, RHS rhs2) { if (EXCLUSIVE) { rhs1 += rhs2; } else { - // Atomic double add via CAS loop using long long reinterpretation +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 + atomicAdd(&rhs1, rhs2); +#elif defined(__CUDA_ARCH__) + unsigned long long int *addr = (unsigned long long int *)&rhs1; + unsigned long long int old = *addr, assumed; + do { + assumed = old; + old = atomicCAS( + addr, + assumed, + __double_as_longlong(rhs2 + __longlong_as_double(assumed))); + } while (assumed != old); +#else union { double d; long long i; @@ -121,8 +166,14 @@ struct SumReductionDouble { new_val.d = old_val.d + rhs2; } while (!__sync_bool_compare_and_swap( (long long *)&rhs1, old_val.i, new_val.i)); +#endif } } + + template + __device__ static void fold_cuda(RHS &rhs1, RHS rhs2) { + fold(rhs1, rhs2); + } }; /** @@ -147,7 +198,12 @@ inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) { case DataType::DOUBLE: return REDOP_SUM_DOUBLE; default: - PANIC("no sum reduction registered for datatype {}", dtype); +#ifndef __CUDA_ARCH__ + throw std::runtime_error("no sum reduction registered for datatype"); +#else + assert(false); + return REDOP_SUM_FLOAT; //unreachable +#endif } } } // namespace FlexFlow diff --git 
a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h index a956d53643..0c0b24c826 100644 --- a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h +++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h @@ -27,7 +27,11 @@ namespace FlexFlow { * else Realm may not shut down properly. */ [[nodiscard]] Realm::Event register_all_tasks(); - +/** + * \brief Registers Realm sum reduction operators for supported data types. + * Defined in tasks/cuda/realm_reduction.cu — compiled with CUDA for GPU atomic support. + */ +void register_reductions(); } // namespace FlexFlow #endif diff --git a/lib/realm-execution/src/realm-execution/tasks/cuda/realm_reduction.cu b/lib/realm-execution/src/realm-execution/tasks/cuda/realm_reduction.cu new file mode 100644 index 0000000000..7755490128 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/cuda/realm_reduction.cu @@ -0,0 +1,31 @@ +// realm_reduction.cu +#include "realm-execution/tasks/realm_reduction.h" +#include +#include +#include + +namespace FlexFlow { + +void register_reductions() { + ::Realm::Runtime rt = ::Realm::Runtime::get_runtime(); + + // register SumReductionFloat with CUDA kernels + { + ::Realm::ReductionOpUntyped *redop = + ::Realm::ReductionOpUntyped::create_reduction_op(); + ::Realm::Cuda::add_cuda_redop_kernels(redop); + bool ok = rt.register_reduction(REDOP_SUM_FLOAT, redop); + assert(ok && "Failed to register SumReductionFloat"); + } + + // register SumReductionDouble with CUDA kernels + { + ::Realm::ReductionOpUntyped *redop = + ::Realm::ReductionOpUntyped::create_reduction_op(); + ::Realm::Cuda::add_cuda_redop_kernels(redop); + bool ok = rt.register_reduction(REDOP_SUM_DOUBLE, redop); + assert(ok && "Failed to register SumReductionDouble"); + } +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc index acafdf59fd..df004146d4 100644 --- a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc +++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc @@ -31,14 +31,6 @@ Realm::Event register_task(Realm::Processor::Kind target_kind, Realm::ProfilingRequestSet()); } -static void register_reductions() { - // register sum reduction ops - Realm::Runtime rt = Realm::Runtime::get_runtime(); - rt.register_reduction(REDOP_SUM_FLOAT); - rt.register_reduction(REDOP_SUM_DOUBLE); - // register_reduction is synchronous — no event returned -} - Realm::Event register_all_tasks() { std::vector pending_registrations; diff --git a/lib/realm-execution/test/src/realm-execution/test_op_combine.cc b/lib/realm-execution/test/src/realm-execution/test_op_combine.cc index 1eb953fe93..47e5ea8175 100644 --- a/lib/realm-execution/test/src/realm-execution/test_op_combine.cc +++ b/lib/realm-execution/test/src/realm-execution/test_op_combine.cc @@ -195,4 +195,158 @@ TEST_SUITE(FF_TEST_SUITE) { result.wait(); } } + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training Combine Op (GPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/2_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + + ControllerTaskResult result = + manager.start_controller([](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input layer + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + 
require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // repartition along dim 0 with degree 2 + // needed so combine has a degree=2 sharded tensor to combine + RepartitionAttrs repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{0}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(repartition_attrs), + {{TensorSlotName::INPUT, t_input}}, + /*weights=*/{}); + parallel_tensor_guid_t t_repartitioned = require_only_key( + repartition_operator.outputs, TensorSlotName::OUTPUT); + + // combine along dim 0 with degree 2 + CombineAttrs combine_attrs{ + /*combine_dim=*/ff_dim_t{nonnegative_int{0}}, + /*combine_degree=*/2_p, + }; + ParallelLayerAddedResult combine_operator = + add_parallel_layer(pcg, + make_layer_attrs(combine_attrs), + {{TensorSlotName::INPUT, t_repartitioned}}, + /*weights=*/{}); + parallel_tensor_guid_t t_combined = require_only_key( + combine_operator.outputs, TensorSlotName::OUTPUT); + + // relu consumer + ParallelLayerAddedResult relu_operator = + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + {{TensorSlotName::INPUT, t_combined}}, + /*weights=*/{}); + + MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU}; + MachineSpaceCoordinate gpu1{0_n, 1_n, DeviceType::GPU}; + + // input: one shard on gpu0 (not yet repartitioned) + ParallelTensorSpaceCoordinate tensor_coord0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + // after repartition: two shards along dim 0 + ParallelTensorSpaceCoordinate tensor_coord_shard0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate tensor_coord_shard1{ + 0_n, 0_n, FFOrdered{1_n, 0_n}}; + // after combine: one shard on gpu0 + ParallelTensorSpaceCoordinate tensor_coord_combined{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + + MappedParallelComputationGraph mpcg{ + pcg, + { + // input: one shard on gpu0 + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + 
{{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + // repartition: OUTPUT only — no INPUT since all replicas + // read same source coord violating bidict uniqueness + {repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard1}, + }}}, + }}}, + // combine: two inputs → one output on gpu0 + {combine_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard1}, + }}}, + }}}, + // relu: one shard on gpu0 + {relu_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_combined}, + {TensorSlotName::OUTPUT, tensor_coord_combined}, + }}}, + }}}, + }}; + + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = create_distributed_ff_handle( + ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = + create_pcg_instance(ctx, + mpcg, + optimizer_attrs, + std::nullopt, + input_tensors, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + + perform_all_passes_for_pcg_instance(pcg_instance, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + }); + result.wait(); + } +} } // namespace test diff --git a/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc b/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc index 9648f68898..f472ccb96b 100644 --- a/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc +++ 
b/lib/realm-execution/test/src/realm-execution/test_op_reduce.cc @@ -285,4 +285,246 @@ TEST_SUITE(FF_TEST_SUITE) { result.wait(); } } +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training Reduction Op (GPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/2_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + ControllerTaskResult result = manager.start_controller([](RealmContext + &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 4_p; + positive_int in_channels = 8_p; + positive_int out_channels = 4_p; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, in_channels}}, DataType::FLOAT}; + + TensorShape weight_tensor_shape = TensorShape{ + TensorDims{FFOrdered{out_channels, in_channels}}, DataType::FLOAT}; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + // input layer + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // weight layer + ParallelLayerAddedResult weights_layer = + pcg_add_input_layer(pcg, weight_tensor_shape); + parallel_tensor_guid_t t_weight = + require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT); + + // repartition input along feature dim (dim 1) with degree 2 + RepartitionAttrs input_repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult input_repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(input_repartition_attrs), + {{TensorSlotName::INPUT, t_input}}, + /*weights=*/{}); + parallel_tensor_guid_t t_input_repartitioned = require_only_key( + input_repartition_operator.outputs, TensorSlotName::OUTPUT); + + // repartition weight along 
feature dim (dim 1) with degree 2 + // to match the repartitioned input + RepartitionAttrs weight_repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult weight_repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(weight_repartition_attrs), + {{TensorSlotName::INPUT, t_weight}}, + /*weights=*/{}); + parallel_tensor_guid_t t_weight_repartitioned = require_only_key( + weight_repartition_operator.outputs, TensorSlotName::OUTPUT); + + // linear with repartitioned input and weight + // shard_dim[-1]=2 → sum_degree=2 output + ParallelLayerAddedResult linear_operator = add_parallel_layer( + pcg, + ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{out_channels, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + /*inputs=*/ + { + {TensorSlotName::INPUT, t_input_repartitioned}, + }, + /*weights=*/ + { + {TensorSlotName::WEIGHT, t_weight_repartitioned}, + }); + parallel_tensor_guid_t t_linear = + require_only_key(linear_operator.outputs, TensorSlotName::OUTPUT); + + // reduction degree=2 — sums partial results + ReductionAttrs reduction_attrs{/*reduction_degree=*/2_p}; + ParallelLayerAddedResult reduction_operator = + add_parallel_layer(pcg, + make_layer_attrs(reduction_attrs), + {{TensorSlotName::INPUT, t_linear}}, + /*weights=*/{}); + parallel_tensor_guid_t t_reduced = + require_only_key(reduction_operator.outputs, TensorSlotName::OUTPUT); + + // relu consumer + ParallelLayerAddedResult relu_operator = + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + {{TensorSlotName::INPUT, t_reduced}}, + /*weights=*/{}); + + MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU}; + MachineSpaceCoordinate gpu1{0_n, 1_n, DeviceType::GPU}; + + // input: unsharded on gpu0 — 2 shard dims + ParallelTensorSpaceCoordinate input_coord{0_n, 0_n, FFOrdered{0_n, 0_n}}; + + // weight: unsharded on gpu0 — 2 shard dims + ParallelTensorSpaceCoordinate 
weight_coord{0_n, 0_n, FFOrdered{0_n, 0_n}}; + + // after repartition: input sharded along feature dim + ParallelTensorSpaceCoordinate input_repartitioned_coord_0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate input_repartitioned_coord_1{ + 0_n, 0_n, FFOrdered{0_n, 1_n}}; + + // after repartition: weight sharded along feature dim + ParallelTensorSpaceCoordinate weight_repartitioned_coord_0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate weight_repartitioned_coord_1{ + 0_n, 0_n, FFOrdered{0_n, 1_n}}; + + // linear output: partial sums — sum_component distinguishes them + // output has 2 shard dims [{4,1},{4,1}] + ParallelTensorSpaceCoordinate linear_coord_0{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + ParallelTensorSpaceCoordinate linear_coord_1{ + 1_n, 0_n, FFOrdered{0_n, 0_n}}; + + // reduced output: fully reduced on gpu0 + ParallelTensorSpaceCoordinate reduced_coord{ + 0_n, 0_n, FFOrdered{0_n, 0_n}}; + + MappedParallelComputationGraph mpcg{ + pcg, + { + // input: unsharded on gpu0 + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, input_coord}}}}}}}, + // weight: unsharded on gpu0 + {weights_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, weight_coord}}}}}}}, + // input repartition: OUTPUT only + {input_repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, input_repartitioned_coord_0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, input_repartitioned_coord_1}, + }}}, + }}}, + // weight repartition: OUTPUT only + {weight_repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, weight_repartitioned_coord_0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, 
weight_repartitioned_coord_1}, + }}}, + }}}, + // linear: INPUT + WEIGHT + OUTPUT per device + {linear_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, input_repartitioned_coord_0}, + {TensorSlotName::WEIGHT, weight_repartitioned_coord_0}, + {TensorSlotName::OUTPUT, linear_coord_0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, input_repartitioned_coord_1}, + {TensorSlotName::WEIGHT, weight_repartitioned_coord_1}, + {TensorSlotName::OUTPUT, linear_coord_1}, + }}}, + }}}, + // reduction: INPUT only — OUTPUT coords not distinct + {reduction_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, linear_coord_0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, linear_coord_1}, + }}}, + }}}, + // relu: on gpu0 only + {relu_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, reduced_coord}, + {TensorSlotName::OUTPUT, reduced_coord}, + }}}, + }}}, + }}; + + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = + create_distributed_ff_handle(ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = create_pcg_instance(ctx, + mpcg, + optimizer_attrs, + std::nullopt, + input_tensors, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + + perform_all_passes_for_pcg_instance(pcg_instance, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + }); + result.wait(); + } +} } // namespace test diff --git a/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc b/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc index 
f900fe3843..5974becae0 100644 --- a/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc +++ b/lib/realm-execution/test/src/realm-execution/test_op_repartition.cc @@ -164,4 +164,127 @@ TEST_SUITE(FF_TEST_SUITE) { result.wait(); } } +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE( + "RealmBackend e2e Training Repartition Op (GPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/2_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + ControllerTaskResult result = + manager.start_controller([](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + // repartition along batch dimension (dim 0) with degree 2 + RepartitionAttrs repartition_attrs{ + /*repartition_dim=*/ff_dim_t{nonnegative_int{0}}, + /*repartition_degree=*/2_p, + }; + ParallelLayerAddedResult repartition_operator = + add_parallel_layer(pcg, + make_layer_attrs(repartition_attrs), + {{TensorSlotName::INPUT, t_input}}, + /*weights=*/{}); + parallel_tensor_guid_t t_repartitioned = require_only_key( + repartition_operator.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult relu_operator = + add_parallel_layer(pcg, + make_layer_attrs(make_relu_attrs()), + {{TensorSlotName::INPUT, t_repartitioned}}, + /*weights=*/{}); + + MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU}; + MachineSpaceCoordinate gpu1{0_n, 1_n, DeviceType::GPU}; + + // input: one shard on gpu0 (not yet 
repartitioned) + ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; + // after repartition: two shards along dim 0 + ParallelTensorSpaceCoordinate tensor_coord_shard0{ + 0_n, 0_n, FFOrdered{0_n}}; + ParallelTensorSpaceCoordinate tensor_coord_shard1{ + 0_n, 0_n, FFOrdered{1_n}}; + + MappedParallelComputationGraph mpcg{ + pcg, + { + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + // repartition: OUTPUT only (no INPUT in binding) + {repartition_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::OUTPUT, tensor_coord_shard1}, + }}}, + }}}, + {relu_operator.parallel_layer, + MappedOperatorTaskGroup{{ + {gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard0}, + {TensorSlotName::OUTPUT, tensor_coord_shard0}, + }}}, + {gpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord_shard1}, + {TensorSlotName::OUTPUT, tensor_coord_shard1}, + }}}, + }}}, + }}; + + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = create_distributed_ff_handle( + ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = + create_pcg_instance(ctx, + mpcg, + optimizer_attrs, + std::nullopt, + input_tensors, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + + perform_all_passes_for_pcg_instance(pcg_instance, + ProfilingSettings{0, 0}, + device_handle, + FFIterationConfig{1_p}); + }); + result.wait(); + } +} } // namespace test