Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ TensorDims get_piece_dims(ParallelTensorDims const &);
TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &);

TensorDims get_reduced_dims(ParallelTensorDims const &);

TensorDims get_per_device_dims(ParallelTensorDims const &dims);
} // namespace FlexFlow

#endif
2 changes: 2 additions & 0 deletions lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape,
std::unordered_set<parallel_tensor_dim_idx_t>
get_parallel_tensor_dim_indices(ParallelTensorShape const &shape);

TensorShape get_per_device_shape(ParallelTensorShape const &s);

} // namespace FlexFlow

#endif
1 change: 0 additions & 1 deletion lib/op-attrs/src/op-attrs/ops/element_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ ParallelTensorDimDegrees get_output_parallel_dim_degrees(
ElementUnaryAttrs const &attrs,
ParallelTensorDimDegrees const &input_degrees) {
ASSERT(input_degrees.sum_degree.value == 1);
ASSERT(input_degrees.discard_copy_degree.value == 1);

return input_degrees;
}
Expand Down
8 changes: 8 additions & 0 deletions lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,12 @@ TensorDims get_reduced_dims(ParallelTensorDims const &dims) {
return TensorDims{dim_sizes};
}

/// \brief Compute the dims of the tensor piece held by each device:
/// every shard dimension's size divided by its parallel degree.
///
/// \param dims The parallel tensor dims (shard sizes plus degrees).
/// \return The per-device (piece) dims.
///
/// \note Each shard dimension's size must be evenly divisible by its
///       degree; otherwise integer division would silently truncate and
///       produce an undersized per-device allocation, so we fail loudly.
TensorDims get_per_device_dims(ParallelTensorDims const &dims) {
  FFOrdered<positive_int> dim_sizes =
      transform(dims.shard_dims, [](ShardParallelDim const &d) {
        int size = d.size.int_from_positive_int();
        int degree = d.degree.int_from_positive_int();
        // Guard against silent truncation on uneven shardings.
        assert(size % degree == 0);
        return positive_int{size / degree};
      });
  return TensorDims{dim_sizes};
}
} // namespace FlexFlow
7 changes: 7 additions & 0 deletions lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,11 @@ std::unordered_set<parallel_tensor_dim_idx_t>
return indices;
}

/// \brief Compute the shape actually allocated on each device: the
/// per-device dims (shard sizes divided by their degrees) paired with the
/// tensor's data type.
TensorShape get_per_device_shape(ParallelTensorShape const &s) {
  TensorDims per_device_dims = get_per_device_dims(s.dims);
  return TensorShape{per_device_dims, s.data_type};
}
} // namespace FlexFlow
9 changes: 0 additions & 9 deletions lib/op-attrs/test/src/op-attrs/ops/element_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,5 @@ TEST_SUITE(FF_TEST_SUITE) {
make_input(
SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
}

SUBCASE("discard copy degree > 1") {
positive_int degree = 2_p;

CHECK_THROWS(get_output_shape(
attrs,
make_input(
SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)));
}
}
}
44 changes: 36 additions & 8 deletions lib/realm-execution/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
ff_add_library(
NAME
realm-execution
SRC_PATTERNS
src/*.cc
PUBLIC_INCLUDE
project(realm-execution
LANGUAGES CXX CUDA)

file(GLOB_RECURSE SRC
CONFIGURE_DEPENDS
LIST_DIRECTORIES False
src/*.cc
src/**/*.cc
src/cuda/*.cu
src/**/*.cu
)

add_library(
realm-execution
SHARED
${SRC}
)

target_include_directories(
realm-execution
PUBLIC
include/
PRIVATE_INCLUDE
PRIVATE
src/
DEPS
)

target_link_libraries(
realm-execution
PUBLIC
compiler
kernels
local-execution
Expand All @@ -19,4 +38,13 @@ ff_add_library(
realm
)

define_ff_vars(realm-execution)

set_target_properties(
realm-execution
PROPERTIES
CUDA_STANDARD 17
)

add_subdirectory(test)

71 changes: 62 additions & 9 deletions lib/realm-execution/include/realm-execution/realm_context.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H
#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H

Expand All @@ -15,6 +16,11 @@

namespace FlexFlow {

enum class CopyDomain {
SRC, // use src instance index space as copy domain (default)
DST, // use dst instance index space as copy domain
};

/**
* @brief An interface that wraps the rest of Realm and protects against certain
* classes of bugs, such as shutdown bugs.
Expand Down Expand Up @@ -63,17 +69,20 @@ struct RealmContext {
int priority = 0);
///\}

/** \name Data movement */
/** \name Data movement and reduction */
///\{
Realm::Event issue_copy(ParallelTensorShape const &src_shape,
Realm::RegionInstance src_inst,
ParallelTensorShape const &dst_shape,
Realm::RegionInstance dst_inst,
Realm::ProfilingRequestSet const &requests,
Realm::Event wait_on = Realm::Event::NO_EVENT,
int priority = 0);
Realm::Event
issue_copy(ParallelTensorShape const &src_shape,
Realm::RegionInstance src_inst,
ParallelTensorShape const &dst_shape,
Realm::RegionInstance dst_inst,
Realm::ProfilingRequestSet const &requests,
Realm::Event wait_on = Realm::Event::NO_EVENT,
int priority = 0,
std::optional<Realm::ReductionOpID> redop_id = std::nullopt,
bool exclusive = false,
CopyDomain domain = CopyDomain::SRC);
///\}

/** \name Instance management */
///\{
std::pair<Realm::RegionInstance, Realm::Event>
Expand All @@ -88,6 +97,50 @@ struct RealmContext {
*/
Realm::Event get_outstanding_events();

/**
* \brief Create a Realm region instance with an offset index space.
*
* Similar to \ref create_instance, but allocates the instance with a
* non-zero origin rect. This is used for sharded tensors where each
* shard occupies a sub-region of the full logical tensor's index space.
*
* For example, given a tensor of shape [10, 16] split along dim 0
* with degree 2:
* - Shard 0 is allocated with rect [0..4, 0..15]
* - Shard 1 is allocated with rect [5..9, 0..15]
*
* This allows plain Realm copies between shards and the combined tensor
* to work correctly — points in each shard's index space match the
* corresponding points in the combined tensor's index space, so Realm
* copies data to the correct region without needing affine indirection.
*
* \param memory The Realm memory in which to allocate the instance.
* \param shape The per-device tensor shape (already divided by degree).
* Determines the size of the instance.
* \param offsets Per-dimension offsets into the full logical tensor.
* \p offsets[i] is the starting index along dimension i.
* For shard k along dim d with piece_size p:
* \p offsets[d] = k * p.
* \param prs Realm profiling request set.
* \param wait_on Event to wait on before creating the instance.
* \return A pair of the created \ref Realm::RegionInstance and a
* \ref Realm::Event that fires when the instance is ready.
*
* \note The instance's index space has origin at \p offsets, not at
* zero. Copies to/from this instance must use its actual index
* space (via \c get_indexspace()) rather than a reconstructed
* zero-based index space.
*
* \see create_instance
* \see perform_instance_allocation_for_value
*/
std::pair<Realm::RegionInstance, Realm::Event> create_instance_with_offset(
Realm::Memory memory,
TensorShape const &shape,
std::vector<int> const &offsets,
Realm::ProfilingRequestSet const &prs,
Realm::Event wait_on = Realm::Event::NO_EVENT);

protected:
/**
* \brief Compact **and clear** the outstanding event queue
Expand Down
Loading