Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ TensorDims get_piece_dims(ParallelTensorDims const &);
TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &);

TensorDims get_reduced_dims(ParallelTensorDims const &);

TensorDims get_per_device_dims(ParallelTensorDims const &dims);
} // namespace FlexFlow

#endif
2 changes: 2 additions & 0 deletions lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape,
std::unordered_set<parallel_tensor_dim_idx_t>
get_parallel_tensor_dim_indices(ParallelTensorShape const &shape);

TensorShape get_per_device_shape(ParallelTensorShape const &s);

} // namespace FlexFlow

#endif
1 change: 0 additions & 1 deletion lib/op-attrs/src/op-attrs/ops/element_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ ParallelTensorDimDegrees get_output_parallel_dim_degrees(
ElementUnaryAttrs const &attrs,
ParallelTensorDimDegrees const &input_degrees) {
ASSERT(input_degrees.sum_degree.value == 1);
ASSERT(input_degrees.discard_copy_degree.value == 1);

return input_degrees;
}
Expand Down
8 changes: 8 additions & 0 deletions lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,12 @@ TensorDims get_reduced_dims(ParallelTensorDims const &dims) {
return TensorDims{dim_sizes};
}

/// \brief Compute the dims of the tensor piece held by each device:
/// every shard dimension's size divided by its parallel degree.
///
/// \param dims The parallel tensor dims (shard sizes plus degrees).
/// \return The per-device (piece) dims.
///
/// \note Each shard dimension's size must be evenly divisible by its
///       degree; otherwise integer division would silently truncate and
///       produce an undersized per-device allocation, so we fail loudly.
TensorDims get_per_device_dims(ParallelTensorDims const &dims) {
  FFOrdered<positive_int> dim_sizes =
      transform(dims.shard_dims, [](ShardParallelDim const &d) {
        int size = d.size.int_from_positive_int();
        int degree = d.degree.int_from_positive_int();
        // Guard against silent truncation on uneven shardings.
        assert(size % degree == 0);
        return positive_int{size / degree};
      });
  return TensorDims{dim_sizes};
}
} // namespace FlexFlow
7 changes: 7 additions & 0 deletions lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,11 @@ std::unordered_set<parallel_tensor_dim_idx_t>
return indices;
}

/// \brief Compute the shape actually allocated on each device: the
/// per-device dims (shard sizes divided by their degrees) paired with the
/// tensor's data type.
TensorShape get_per_device_shape(ParallelTensorShape const &s) {
  TensorDims per_device_dims = get_per_device_dims(s.dims);
  return TensorShape{per_device_dims, s.data_type};
}
} // namespace FlexFlow
9 changes: 0 additions & 9 deletions lib/op-attrs/test/src/op-attrs/ops/element_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,5 @@ TEST_SUITE(FF_TEST_SUITE) {
make_input(
SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
}

SUBCASE("discard copy degree > 1") {
positive_int degree = 2_p;

CHECK_THROWS(get_output_shape(
attrs,
make_input(
SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)));
}
}
}
44 changes: 36 additions & 8 deletions lib/realm-execution/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
ff_add_library(
NAME
realm-execution
SRC_PATTERNS
src/*.cc
PUBLIC_INCLUDE
project(realm-execution
LANGUAGES CXX CUDA)

file(GLOB_RECURSE SRC
CONFIGURE_DEPENDS
LIST_DIRECTORIES False
src/*.cc
src/**/*.cc
src/cuda/*.cu
src/**/*.cu
)

add_library(
realm-execution
SHARED
${SRC}
)

target_include_directories(
realm-execution
PUBLIC
include/
PRIVATE_INCLUDE
PRIVATE
src/
DEPS
)

target_link_libraries(
realm-execution
PUBLIC
compiler
kernels
local-execution
Expand All @@ -19,4 +38,13 @@ ff_add_library(
realm
)

define_ff_vars(realm-execution)

set_target_properties(
realm-execution
PROPERTIES
CUDA_STANDARD 17
)

add_subdirectory(test)

71 changes: 62 additions & 9 deletions lib/realm-execution/include/realm-execution/realm_context.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H
#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H

Expand All @@ -15,6 +16,11 @@

namespace FlexFlow {

enum class CopyDomain {
SRC, // use src instance index space as copy domain (default)
DST, // use dst instance index space as copy domain
};

/**
* @brief An interface that wraps the rest of Realm and protects against certain
* classes of bugs, such as shutdown bugs.
Expand Down Expand Up @@ -63,17 +69,20 @@ struct RealmContext {
int priority = 0);
///\}

/** \name Data movement */
/** \name Data movement and reduction */
///\{
Realm::Event issue_copy(ParallelTensorShape const &src_shape,
Realm::RegionInstance src_inst,
ParallelTensorShape const &dst_shape,
Realm::RegionInstance dst_inst,
Realm::ProfilingRequestSet const &requests,
Realm::Event wait_on = Realm::Event::NO_EVENT,
int priority = 0);
Realm::Event
issue_copy(ParallelTensorShape const &src_shape,
Realm::RegionInstance src_inst,
ParallelTensorShape const &dst_shape,
Realm::RegionInstance dst_inst,
Realm::ProfilingRequestSet const &requests,
Realm::Event wait_on = Realm::Event::NO_EVENT,
int priority = 0,
std::optional<Realm::ReductionOpID> redop_id = std::nullopt,
bool exclusive = false,
CopyDomain domain = CopyDomain::SRC);
///\}

/** \name Instance management */
///\{
std::pair<Realm::RegionInstance, Realm::Event>
Expand All @@ -88,6 +97,50 @@ struct RealmContext {
*/
Realm::Event get_outstanding_events();

/**
* \brief Create a Realm region instance with an offset index space.
*
* Similar to \ref create_instance, but allocates the instance with a
* non-zero origin rect. This is used for sharded tensors where each
* shard occupies a sub-region of the full logical tensor's index space.
*
* For example, given a tensor of shape [10, 16] split along dim 0
* with degree 2:
* - Shard 0 is allocated with rect [0..4, 0..15]
* - Shard 1 is allocated with rect [5..9, 0..15]
*
* This allows plain Realm copies between shards and the combined tensor
* to work correctly — points in each shard's index space match the
* corresponding points in the combined tensor's index space, so Realm
* copies data to the correct region without needing affine indirection.
*
* \param memory The Realm memory in which to allocate the instance.
* \param shape The per-device tensor shape (already divided by degree).
* Determines the size of the instance.
* \param offsets Per-dimension offsets into the full logical tensor.
* \p offsets[i] is the starting index along dimension i.
* For shard k along dim d with piece_size p:
* \p offsets[d] = k * p.
* \param prs Realm profiling request set.
* \param wait_on Event to wait on before creating the instance.
* \return A pair of the created \ref Realm::RegionInstance and a
* \ref Realm::Event that fires when the instance is ready.
*
* \note The instance's index space has origin at \p offsets, not at
* zero. Copies to/from this instance must use its actual index
* space (via \c get_indexspace()) rather than a reconstructed
* zero-based index space.
*
* \see create_instance
* \see perform_instance_allocation_for_value
*/
std::pair<Realm::RegionInstance, Realm::Event> create_instance_with_offset(
Realm::Memory memory,
TensorShape const &shape,
std::vector<int> const &offsets,
Realm::ProfilingRequestSet const &prs,
Realm::Event wait_on = Realm::Event::NO_EVENT);

protected:
/**
* \brief Compact **and clear** the outstanding event queue
Expand Down
Loading