diff --git a/.gitignore b/.gitignore index d4fb281..f572967 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Build artifacts +/build/ +include/infiniccl.h + # Prerequisites *.d diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..8afb785 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,112 @@ +cmake_minimum_required(VERSION 3.18) +project(InfiniCCL LANGUAGES C CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# ========================================================= +# --- DEVICE OPTIONS (Hardware Runtimes) --- +# ========================================================= +option(WITH_NVIDIA "Enable NVIDIA GPU support" OFF) +option(WITH_METAX "Enable MetaX GPU support" OFF) + +set(WITH_CPU ON CACHE INTERNAL "CPU backend is always enabled") + +# ========================================================= +# --- BACKEND OPTIONS (Communication Protocols) --- +# ========================================================= +option(WITH_OMPI "Enable OpenMPI backend" OFF) +option(WITH_NCCL "Enable NCCL backend" OFF) +# OMPI is the default bootstrap/CPU backend +if(NOT WITH_OMPI AND NOT WITH_NCCL) + set(WITH_OMPI ON) + message(STATUS "No backend specified. Defaulting to WITH_OMPI=ON") +endif() + +# ========================================================= +# --- MISC. BUILD OPTIONS --- +# ========================================================= +option(AUTO_DETECT_DEVICES "Automatically detect available devices" ON) + +if(AUTO_DETECT_DEVICES) + message(STATUS "Auto-detecting available devices...") + + # NVIDIA + file(GLOB NVIDIA_DEV_FILES "/dev/nvidia*") + + if(NVIDIA_DEV_FILES) + set(WITH_NVIDIA ON) + message(STATUS "Auto-detected NVIDIA environment.") + endif() + + # MetaX + if(DEFINED ENV{MACA_PATH}) + set(WITH_METAX ON) + message(STATUS "Auto-detected MetaX environment from MACA_PATH") + else() + execute_process( + COMMAND sh -c "grep -h 9999 /sys/bus/pci/devices/*/vendor 2>/dev/null" + OUTPUT_VARIABLE _pci_vendor_output + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + string(FIND "${_pci_vendor_output}" "9999" _found_pos) + + if(_found_pos GREATER -1) + set(WITH_METAX ON) + message(STATUS "Detected MetaX GPU from PCI vendor ID 0x9999") + else() + set(WITH_METAX OFF) + message(STATUS "No MetaX GPU detected") + endif() + endif() +endif() + +# ========================================================= +# --- DEPENDENCY DISCOVERY --- +# ========================================================= +if(WITH_NVIDIA) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +if(WITH_METAX) + set(MACA_PATH $ENV{MACA_PATH}) + if(NOT MACA_PATH) + # Normally can be found at: `/opt/maca/`. + set(MACA_PATH "/opt/maca") + message(WARNING "MACA_PATH environment variable not set. Defaulting to /opt/maca/. Please verify this path is correct.") + endif() + + set(CMAKE_C_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh) + set(CMAKE_CXX_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh) + + include_directories("${MACA_PATH}/include") + link_directories("${MACA_PATH}/lib") + + find_library(MACA_RUNTIME_LIB NAMES mcruntime HINTS "${MACA_PATH}/lib" REQUIRED) +endif() + +if(WITH_OMPI) + find_package(MPI REQUIRED) +endif() + +if(WITH_NCCL) + if (NOT WITH_NVIDIA) + message(FATAL_ERROR "NCCL backend requires NVIDIA GPU support. Please enable WITH_NVIDIA.") + endif() + + find_library(NCCL_LIB NAMES nccl REQUIRED) + find_path(NCCL_INC NAMES nccl.h REQUIRED) + + include_directories(${NCCL_INC}) +endif() + +# Python is required for code generation +find_package(Python3 REQUIRED) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +add_subdirectory(src) +add_subdirectory(examples) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..74a65e2 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,26 @@ +file(GLOB_RECURSE EXAMPLE_SOURCES "*.cc") + +foreach(source_file ${EXAMPLE_SOURCES}) + get_filename_component(example_name ${source_file} NAME_WE) + + add_executable(${example_name} ${source_file}) + + target_link_libraries(${example_name} PRIVATE infiniccl) + + # Add runtime and backend dependencies for direct runtime/backend usage + if(WITH_NVIDIA) + target_link_libraries(${example_name} PRIVATE CUDA::cudart) + endif() + + if(WITH_OMPI) + target_link_libraries(${example_name} PRIVATE MPI::MPI_CXX) + endif() + + # Explicitly allow examples to "peek" into the internal src and binary dirs + # This is necessary because these were marked PRIVATE in the library's CMake + target_include_directories(${example_name} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + "${PROJECT_SOURCE_DIR}/src" # For internal templates like runtime.h + "${CMAKE_BINARY_DIR}/src" # For the generated backend_manifest.h + ) +endforeach() diff --git a/examples/all_reduce.cc b/examples/all_reduce.cc new file mode 100644 index 0000000..743a751 --- /dev/null +++ b/examples/all_reduce.cc @@ -0,0 +1,142 @@ +/** + * InfiniCCL Example: AllReduce + * * This example demonstrates the planned API for performing a + * collective sum-reduction across multiple GPUs and nodes. + */ + +#include +#include +#include + +// Public API +#include "infiniccl.h" + +// Example-specific utilities +#include "utils.h" + +// Internal Headers (Accessible via example-specific include paths, technically +// not public APIs) +#include "backend_manifest.h" +#include "device.h" +#include "runtime.h" +#include "traits.h" + +using namespace infini::ccl; + +void RunAllReduceExample(int argc, char **argv, int warmup_iter, + int profile_iter, const size_t kNumElements) { + constexpr Device::Type kDevType = + ListGetBest(EnabledDevices{}); + + CHECK_INFINI(infiniInit(&argc, &argv)); + + int rank, size; + CHECK_INFINI(infiniGetRank(&rank)); + CHECK_INFINI(infiniGetSize(&size)); + + char hostname[256]; + gethostname(hostname, sizeof(hostname)); + + // Map local rank to GPU device. + // Note: this is just for info printing. In practice, this part is not needed. + const char *local_rank_str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); + int local_rank = 0; + if (local_rank_str != nullptr) { + local_rank = std::atoi(local_rank_str); + } + + std::cout << "[Rank " << rank << "] Host: " << hostname + << " | GPU: " << Device::StringFromType(kDevType) << " " + << " | Device " << local_rank << std::endl; + + // Setup Communicator + infiniComm_t comm; + CHECK_INFINI(infiniCommInitAll(&comm, size, nullptr)); + + // Prepare Data + std::vector h_send(kNumElements); + std::vector h_recv(kNumElements, 0.0f); + + // Initialize: each rank provides its (rank + 1) as data + for (size_t i = 0; i < kNumElements; i++) { + h_send[i] = static_cast(rank + 1); + } + + float *d_send, *d_recv; + size_t total_bytes = kNumElements * sizeof(*d_send); + Runtime::Malloc(&d_send, total_bytes); + Runtime::Malloc(&d_recv, total_bytes); + Runtime::Memcpy(d_send, h_send.data(), total_bytes, + Runtime::MemcpyHostToDevice); + Runtime::Memcpy(d_recv, h_recv.data(), total_bytes, + Runtime::MemcpyHostToDevice); + + if (rank == 0) { + std::cout << "\n=== Performing AllReduce on GPU Memory ===" << std::endl; + std::cout << "Data size: " << kNumElements << " floats (" + << total_bytes / 1024 / 1024 << " MB)" << std::endl; + std::cout << "Operation: Sum" << std::endl; + std::cout << "Warm-up iterations: " << warmup_iter << std::endl; + std::cout << "Profile iterations: " << profile_iter << std::endl; + } + + Runtime::StreamSynchronize(nullptr); + + // warm-up and D2H transfer the answer + CHECK_INFINI(infiniAllReduce(d_send, d_recv, kNumElements, infiniFloat32, + infiniSum, comm, nullptr)); + Runtime::Memcpy(h_recv.data(), d_recv, kNumElements * sizeof(float), + Runtime::MemcpyDeviceToHost); + + for (int i = 1; i < warmup_iter; ++i) { + CHECK_INFINI(infiniAllReduce(d_send, d_recv, kNumElements, infiniFloat32, + infiniSum, comm, nullptr)); + } + Runtime::StreamSynchronize(nullptr); + + // Profiling + Timer timer; + + for (int i = 0; i < profile_iter; i++) { + CHECK_INFINI(infiniAllReduce(d_send, d_recv, kNumElements, infiniFloat32, + infiniSum, comm, nullptr)); + } + + Runtime::StreamSynchronize(nullptr); + double elapsed = timer.elapsed_ms() / static_cast(profile_iter); + + // Result Validation + float expected = 0.0f; + for (int r = 0; r < size; r++) { + expected += static_cast(r + 1); + } + + Validator::ValidateResult(h_recv.data(), kNumElements, expected, rank); + + // Metrics Reporting (Only from rank 0 for cleaner output) + if (rank == 0) { + Metrics metrics{elapsed, total_bytes, size}; + metrics.Print(); + } + + // Cleanup + Runtime::Free(d_send); + Runtime::Free(d_recv); + + CHECK_INFINI(infiniCommDestroy(comm)); + CHECK_INFINI(infiniFinalize()); + + if (rank == 0) { + std::cout << "InfiniCCL finalized." << std::endl; + } +} + +int main(int argc, char **argv) { + int warmup_iters = 2; + int profile_iters = 20; + size_t num_elements = 1 << 20; + + RunAllReduceExample(argc, argv, warmup_iters, profile_iters, num_elements); + + return EXIT_SUCCESS; +} diff --git a/examples/utils.h b/examples/utils.h new file mode 100644 index 0000000..d8ee1da --- /dev/null +++ b/examples/utils.h @@ -0,0 +1,100 @@ +#ifndef INFINI_CCL_EXAMPLES_UTILS_H_ +#define INFINI_CCL_EXAMPLES_UTILS_H_ + +#include +#include +#include +#include +#include +#include + +// Simple check macro for the C-API +#define CHECK_INFINI(cmd) \ + do { \ + infiniResult_t res = (cmd); \ + if (res != infiniSuccess) { \ + std::cerr << "[InfiniCCL Error] received error code " << res \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Simple Timer for profiling +class Timer { + std::chrono::high_resolution_clock::time_point start; + +public: + Timer() : start(std::chrono::high_resolution_clock::now()) {} + double elapsed_ms() const { + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); + } +}; + +struct Metrics { + double elapsed_ms; + size_t total_bytes; + int world_size; + + void Print() const { + double seconds = elapsed_ms / 1000.0; + double gigabytes = + static_cast(total_bytes) / (1024.0 * 1024.0 * 1024.0); + + // Industry standard formula: 2 * (n-1) / n + double bus_bw = + (2.0 * (world_size - 1) / world_size) * (gigabytes / seconds); + double alg_bw = gigabytes / seconds; + + std::cout << "Time: " << std::fixed << std::setprecision(3) + << elapsed_ms << " ms" << std::endl; + std::cout << "Throughput: " << std::fixed << std::setprecision(2) + << bus_bw << " GB/s (Bus BW)" << std::endl; + std::cout << "Alg Bandwidth: " << std::fixed << std::setprecision(2) + << alg_bw << " GB/s" << std::endl; + } +}; + +class Validator { +public: + template + static bool ValidateResult(const T *data, size_t count, T expected_val, + int rank) { + bool correct = true; + int error_count = 0; + + for (size_t i = 0; i < count; ++i) { + if (std::fabs(static_cast(data[i]) - + static_cast(expected_val)) > 1e-3) { + correct = false; + error_count++; + if (error_count <= 3 && rank == 0) { + std::cerr << "Error at index " << i << ": " << data[i] + << " != " << expected_val << std::endl; + } + } + } + + if (rank == 0) { + const char *GREEN = "\033[32m"; + const char *RED = "\033[31m"; + const char *RESET = "\033[0m"; + + std::cout << "\n=== AllReduce Results ===" << std::endl; + std::cout << "Correct: " + << (correct ? (GREEN + std::string("YES") + RESET) + : (RED + std::string("NO") + RESET)); + if (!correct) + std::cout << " (" << error_count << " errors)"; + std::cout << std::endl; + + std::cout << "Expect: " << std::fixed << std::setprecision(2) + << static_cast(expected_val) << std::endl; + std::cout << "Actual: " << std::fixed << std::setprecision(2) + << static_cast(data[0]) << std::endl; + } + return correct; + } +}; + +#endif diff --git a/include/comm.h b/include/comm.h new file mode 100644 index 0000000..a778a0f --- /dev/null +++ b/include/comm.h @@ -0,0 +1,48 @@ +#ifndef INFINI_CCL_COMM_H_ +#define INFINI_CCL_COMM_H_ + +#include + +#include "data_type.h" +#include "return_status.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void *infiniComm_t; + +// Initialization +infiniResult_t infiniInit(int *argc, char ***argv); +infiniResult_t infiniFinalize(void); + +// Rank/Size Query +infiniResult_t infiniGetRank(int *rank); +infiniResult_t infiniGetSize(int *size); + +// Communicator Management +infiniResult_t infiniCommInitAll(infiniComm_t *comm, int ndev, + const int *devlist); +infiniResult_t infiniCommDestroy(infiniComm_t comm); + +// --- Reduction Operations --- +typedef enum { + infiniSum = 0, + infiniProd = 1, + infiniMax = 2, + infiniMin = 3, + infiniAvg = 4, + infiniNumOps +} infiniRedOp_t; + +// Collective Communication Functions +infiniResult_t infiniAllReduce(const void *sendbuff, void *recvbuff, + size_t count, infiniDataType_t datatype, + infiniRedOp_t op, infiniComm_t comm, + void *stream); + +#ifdef __cplusplus +} +#endif + +#endif // INFINI_CCL_COMM_H_ diff --git a/include/data_type.h b/include/data_type.h new file mode 100644 index 0000000..f39c409 --- /dev/null +++ b/include/data_type.h @@ -0,0 +1,33 @@ +#ifndef INFINI_CCL_DATA_TYPE_H_ +#define INFINI_CCL_DATA_TYPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + infiniChar = 0, + infiniInt8 = 0, + infiniInt16 = 1, + infiniInt = 2, + infiniInt32 = 2, + infiniInt64 = 3, + infiniUInt8 = 4, + infiniUInt16 = 5, + infiniUInt32 = 6, + infiniUInt64 = 7, + infiniHalf = 8, + infiniFloat16 = 8, + infiniBFloat16 = 9, + infiniFloat = 10, + infiniFloat32 = 10, + infiniDouble = 11, + infiniFloat64 = 11, + infiniNumTypes = 12, +} infiniDataType_t; + +#ifdef __cplusplus +} +#endif + +#endif // INFINI_CCL_DATA_TYPE_H_ diff --git a/include/return_status.h b/include/return_status.h new file mode 100644 index 0000000..4e6bf6b --- /dev/null +++ b/include/return_status.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_RETURN_STATUS_H_ +#define INFINI_CCL_RETURN_STATUS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + infiniSuccess = 0, + infiniUnhandledError = 1, + infiniSystemError = 2, + infiniInternalError = 3, + infiniInvalidArgument = 4, + infiniInvalidUsage = 5, + infiniRemoteError = 6, + infiniInProgress = 7, + infiniNumResults = 8 +} infiniResult_t; + +#ifdef __cplusplus +} +#endif + +#endif // INFINI_CCL_RETURN_STATUS_H_ diff --git a/scripts/gen_bridge.py b/scripts/gen_bridge.py new file mode 100644 index 0000000..066d3c3 --- /dev/null +++ b/scripts/gen_bridge.py @@ -0,0 +1,188 @@ +import os +import re +import sys + +# ================================================================= +# CONFIGURATION +# ================================================================= + +# The standardized warning header for generated files +AUTOGEN_HEADER = """/* + * ----------------------------------------------------------------- + * AUTO-GENERATED FILE — DO NOT EDIT. + * ----------------------------------------------------------------- + * This file is generated during the build process. + * Any changes made here may be overwritten during the build or code + * generation process. + * ----------------------------------------------------------------- + */ +""" + +# Hardware traits to look for in device directories +DEVICE_TRAIT_HEADERS = [ + "device_.h", + "runtime_.h", + "data_type_.h" +] + +# Map logical backend names (from CMake) to their internal source paths +BACKEND_PATH_MAP = { + "ompi": "ompi/impl", + "nccl": "nvidia/nccl" +} + +# ================================================================= +# LOGIC +# ================================================================= + +def snake_to_camel(name): + """Converts infini_all_reduce or infiniAllReduce to AllReduce.""" + clean_name = re.sub(r'^infini', '', name) + if '_' in clean_name: + return "".join(x.capitalize() for x in clean_name.split('_')) + return clean_name[0].upper() + clean_name[1:] + +def parse_signatures(header_path): + """Extracts function metadata from the public C header.""" + signatures = [] + regex = r"(infiniResult_t)\s+(infini\w+)\s*\((.*?)\);" + if not os.path.exists(header_path): + return signatures + + with open(header_path, 'r') as f: + content = f.read() + for m in re.finditer(regex, content, re.DOTALL): + ret_type, func_name, params_raw = m.group(1), m.group(2), m.group(3).strip() + + param_names = [] + if params_raw and params_raw != "void": + for p in params_raw.split(','): + # Extract the variable name (last word before comma/bracket) + param_names.append(p.strip().split()[-1].replace('*', '')) + + signatures.append({ + 'ret': ret_type, + 'name': func_name, + 'params': params_raw, + 'args': ", ".join(param_names), + 'key': snake_to_camel(func_name) + }) + return signatures + +def generate(project_root, output_dir, devices, backends): + src_dir = os.path.join(project_root, "src") + base_dir = os.path.join(src_dir, "base") + header_path = os.path.join(project_root, "include/comm.h") + + # Get the list of all operations defined in the base directory + ops_in_base = [f[:-2] for f in os.listdir(base_dir) if f.endswith('.h')] if os.path.exists(base_dir) else [] + sigs = parse_signatures(header_path) + + # 1. Generate backend_manifest.h + manifest_lines = [ + AUTOGEN_HEADER, + "#ifndef INFINI_CCL_BACKEND_MANIFEST_H_", + "#define INFINI_CCL_BACKEND_MANIFEST_H_" + ] + + found_devices = [] + + # Process Active Devices + for dev in devices: + if not dev: continue + + device_included = False + manifest_lines.append(f"\n// --- DEVICE: {dev.upper()} ---") + + for trait in DEVICE_TRAIT_HEADERS: + rel_path = f"{dev}/{trait}" + if os.path.exists(os.path.join(src_dir, rel_path)): + manifest_lines.append(f"#include \"{rel_path}\"") + device_included = True + + if device_included: + found_devices.append(f"Device::Type::k{dev.capitalize()}") + + # Process Active Backends + for bb in backends: + if not bb or bb not in BACKEND_PATH_MAP: continue + + manifest_lines.append(f"\n// --- BACKEND: {bb.upper()} ---") + impl_subpath = BACKEND_PATH_MAP[bb] + + for op in ops_in_base: + rel_path = f"{impl_subpath}/{op}.h" + if os.path.exists(os.path.join(src_dir, rel_path)): + manifest_lines.append(f"#include \"{rel_path}\"") + + # Add the Type Alias `EnabledDevices` + manifest_lines.append("\nnamespace infini::ccl {\n") + if found_devices: + dev_list_str = ", ".join(found_devices) + manifest_lines.append(f"using EnabledDevices = List<{dev_list_str}>;") + else: + manifest_lines.append("using EnabledDevices = List<>;") + manifest_lines.append("\n} // namespace infini::ccl") + + manifest_lines.append("\n#endif // INFINI_CCL_BACKEND_MANIFEST_H_\n") + + with open(os.path.join(output_dir, "backend_manifest.h"), 'w') as f: + f.write("\n".join(manifest_lines)) + + # 2. Generate comm_bridge.cc + bridge_lines = [ + AUTOGEN_HEADER, + '#include "backend_manifest.h"', + '#include "comm.h"', + '#include "comm_impl.h"', + '#include "data_type_impl.h"', + '#include "operation.h"', + '\nnamespace infini::ccl {', + '\nextern "C" {' + ] + + for s in sigs: + # We need to transform the raw args to add casts for specific types + args_with_casts = [] + # Split params to analyze types (simplified approach) + params_list = s['params'].split(',') + + for p in params_list: + p = p.strip() + if not p or p == "void": continue + + # Get the type and the name + parts = p.split() + arg_type = parts[0] + arg_name = parts[-1].replace('*', '') + + # Apply static_cast for specialized Infini types + if arg_type == "infiniDataType_t": + args_with_casts.append(f"static_cast({arg_name})") + elif arg_type == "infiniRedOp_t": + args_with_casts.append(f"static_cast({arg_name})") + else: + args_with_casts.append(arg_name) + + casted_args_str = ", ".join(args_with_casts) + + bridge_lines.append( + f"\n{s['ret']} {s['name']}({s['params']}) {{\n" + f" return static_cast<{s['ret']}>(Operation<{s['key']}>::Call({casted_args_str}));\n" + f"}}" + ) + + bridge_lines.append('\n} // extern "C"') + bridge_lines.append('} // namespace infini::ccl\n') + + with open(os.path.join(output_dir, "comm_bridge.cc"), 'w') as f: + f.write("\n".join(bridge_lines)) + +if __name__ == "__main__": + # Path/List arguments passed from CMake + root = sys.argv[1] + out = sys.argv[2] + devs = sys.argv[3].split(';') if len(sys.argv) > 3 else [] + backs = sys.argv[4].split(';') if len(sys.argv) > 4 else [] + + generate(root, out, devs, backs) diff --git a/scripts/mxcc_wrapper.sh b/scripts/mxcc_wrapper.sh new file mode 100644 index 0000000..0010617 --- /dev/null +++ b/scripts/mxcc_wrapper.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Filter out flags unsupported by `mxcc`. +ARGS=() +skip_next=0 +for arg in "$@"; do + if [ $skip_next -eq 1 ]; then + skip_next=0 + continue + fi + case "$arg" in + -pthread) + ;; + -B) + skip_next=1 + ;; + -B*) + ;; + *) + ARGS+=("$arg") + ;; + esac +done + +exec ${MACA_PATH}/mxgpu_llvm/bin/mxcc "${ARGS[@]}" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..3fe8c6b --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,163 @@ +add_library(infiniccl SHARED) + +set(DEVICE_LIST "cpu") +set(BACKEND_LIST "") +set(AUTOGEN_WARNING [[/* + * ----------------------------------------------------------------- + * AUTO-GENERATED FILE — DO NOT EDIT. + * ----------------------------------------------------------------- + * This file is generated during the build process. + * Any changes made here may be overwritten during the build or code + * generation process. + * ----------------------------------------------------------------- + */ + +]]) + +file(GLOB CORE_SRCS "*.cc") +file(GLOB_RECURSE BASE_IMPL_SRCS "base/*.cc") + +target_sources(infiniccl PRIVATE + ${CORE_SRCS} + ${BASE_IMPL_SRCS} +) + +# ========================================================= +# --- DEVICE SECTIONS (Hardware Runtimes) --- +# ========================================================= + +# CPU (Always On as the base architecture) +file(GLOB_RECURSE CPU_SRCS "cpu/*.cc") +target_sources(infiniccl PRIVATE ${CPU_SRCS}) + +# NVIDIA +if(WITH_NVIDIA) + list(APPEND DEVICE_LIST "nvidia") + + set(NVIDIA_PATTERNS + "cuda/*.cc" + "cuda/*.cpp" + "cuda/*.cu" + "nvidia/*.cc" + "nvidia/*.cpp" + "nvidia/*.cu" + ) + file(GLOB NVIDIA_SOURCES ${NVIDIA_PATTERNS}) + + target_sources(infiniccl PRIVATE ${NVIDIA_SOURCES}) + target_link_libraries(infiniccl PRIVATE CUDA::cudart CUDA::cuda_driver) + + set_target_properties(infiniccl PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) +endif() + +# MetaX +if(WITH_METAX) + list(APPEND DEVICE_LIST "metax") + + set(METAX_PATTERNS + "cuda/*.cc" + "cuda/*.cpp" + "metax/*.cc" + "metax/*.cpp" + "metax/*.maca" + ) + file(GLOB METAX_SOURCES ${METAX_PATTERNS}) + + set_source_files_properties(${METAX_SOURCES} PROPERTIES LANGUAGE CXX) + + target_sources(infiniccl PRIVATE ${METAX_SOURCES}) + target_include_directories(infiniccl PRIVATE "${MACA_PATH}/include") + target_link_libraries(infiniccl PRIVATE ${MACA_RUNTIME_LIB}) + target_compile_options(infiniccl PRIVATE $<$:-x maca>) +endif() + +# ========================================================= +# --- BACKEND SECTIONS (Communication Protocols) --- +# ========================================================= + +# OpenMPI +if(WITH_OMPI) + list(APPEND BACKEND_LIST "ompi") + file(GLOB_RECURSE OMPI_SRCS "ompi/*.cc" "ompi/*.cpp") + target_sources(infiniccl PRIVATE ${OMPI_SRCS}) + + if(TARGET MPI::MPI_CXX) + target_link_libraries(infiniccl PRIVATE MPI::MPI_CXX) + endif() +endif() + +# NCCL +if(WITH_NCCL) + list(APPEND BACKEND_LIST "nccl") + + if (WITH_NVIDIA) + file(GLOB_RECURSE NCCL_SRCS "nvidia/nccl/*.cc" "nvidia/nccl/*.cpp" "nvidia/nccl/*.cu") + endif() + + target_sources(infiniccl PRIVATE ${NCCL_SRCS}) + target_include_directories(infiniccl PRIVATE ${NCCL_INC}) + target_link_libraries(infiniccl PRIVATE ${NCCL_LIB}) +endif() + +# ========================================================= +# --- File Generation --- +# ========================================================= +# --- Generate umbrella header: include/infiniccl.h --- +set(PUBLIC_HEADERS_DIR "${PROJECT_SOURCE_DIR}/include") +file(GLOB PUBLIC_HEADERS "${PUBLIC_HEADERS_DIR}/*.h") + +set(INFINICCL_H_CONTENT "${AUTOGEN_WARNING}#ifndef INFINI_CCL_H_\n#define INFINI_CCL_H_\n\n") + +foreach(HEADER_PATH ${PUBLIC_HEADERS}) + get_filename_component(HEADER_NAME ${HEADER_PATH} NAME) + if(NOT "${HEADER_NAME}" STREQUAL "infiniccl.h") + set(INFINICCL_H_CONTENT "${INFINICCL_H_CONTENT}#include \"${HEADER_NAME}\"\n") + endif() +endforeach() + +set(INFINICCL_H_CONTENT "${INFINICCL_H_CONTENT}\n#endif // INFINI_CCL_H_\n") + +file(WRITE "${PUBLIC_HEADERS_DIR}/infiniccl.h" "${INFINICCL_H_CONTENT}") + +# --- Generate the bridge files --- +set(GENERATED_BRIDGE "${CMAKE_CURRENT_BINARY_DIR}/comm_bridge.cc") +set(GENERATED_MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/backend_manifest.h") +set(DEV_STR "${DEVICE_LIST}") +set(BACK_STR "${BACKEND_LIST}") + +add_custom_command( + OUTPUT "${GENERATED_BRIDGE}" "${GENERATED_MANIFEST}" + COMMAND ${Python3_EXECUTABLE} "${PROJECT_SOURCE_DIR}/scripts/gen_bridge.py" + "${PROJECT_SOURCE_DIR}" + "${CMAKE_CURRENT_BINARY_DIR}" + "${DEV_STR}" + "${BACK_STR}" + DEPENDS "${PROJECT_SOURCE_DIR}/scripts/gen_bridge.py" + "${PROJECT_SOURCE_DIR}/include/comm.h" + ${BASE_IMPL_SRCS} + VERBATIM + COMMENT "Generating InfiniCCL bridge and manifest files for Devices: [${DEV_STR}] Backends: [${BACK_STR}]..." +) + +target_sources(infiniccl PRIVATE "${GENERATED_BRIDGE}") +target_include_directories(infiniccl PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") + +# ========================================================= +# --- Interface and Export --- +# ========================================================= + +target_include_directories(infiniccl + PUBLIC + $ + $ + PRIVATE + $ + $ +) + +string(REPLACE ";" ", " PRINT_DEVS "${DEVICE_LIST}") +string(REPLACE ";" ", " PRINT_BACKS "${BACKEND_LIST}") +message(STATUS "InfiniCCL Config: Devices [${PRINT_DEVS}] | Backends [${PRINT_BACKS}]") diff --git a/src/backend.h b/src/backend.h new file mode 100644 index 0000000..3bb0456 --- /dev/null +++ b/src/backend.h @@ -0,0 +1,58 @@ +#ifndef INFINI_CCL_BACKEND_H_ +#define INFINI_CCL_BACKEND_H_ + +#include + +#include "traits.h" + +namespace infini::ccl { + +enum class BackendType : int8_t { + kOmpi = 0, + kGloo = 1, + kNccl = 2, + kMccl = 3, + kRccl = 4, + kCncl = 5, + kHccl = 6, + kCount +}; + +using AllBackendTypes = + List; + +template struct BackendEnabled : std::false_type {}; + +// Deferred computation of active backends for a specific operation Key. +template struct ActiveBackendsImpl { + struct Filter { + template + std::enable_if_t< + BackendEnabled(kBackend)>::value> + operator()(ValueTag) const {} + }; + + using type = typename FilterList, AllBackendTypes>::type; +}; + +template +using ActiveBackends = typename ActiveBackendsImpl::type; + +// Priority trait for backend selection. +template struct BackendPriority { + static constexpr int value = 0; +}; + +template <> struct BackendPriority { + static constexpr int value = 1; +}; + +template <> struct BackendPriority { + static constexpr int value = 10; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BACKEND_H_ diff --git a/src/base/all_reduce.h b/src/base/all_reduce.h new file mode 100644 index 0000000..dfbbb72 --- /dev/null +++ b/src/base/all_reduce.h @@ -0,0 +1,58 @@ +#ifndef INFINI_CCL_BASE_ALL_REDUCE_H_ +#define INFINI_CCL_BASE_ALL_REDUCE_H_ + +#include "comm_impl.h" +#include "communicator.h" +#include "logging.h" +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template +struct AllReduceImpl; + +class AllReduce : public Operation { +public: + template + static ReturnStatus Execute(const void *send_buff, void *recv_buff, + size_t count, DataType datatype, + ReductionOpType op, void *comm_handle, + void *stream) { + if (HasInvalidArgs(send_buff, recv_buff, datatype, op, comm_handle)) { + return ReturnStatus::kInvalidArgument; + } + auto *comm = static_cast(comm_handle); + return AllReduceImpl::Apply( + send_buff, recv_buff, count, datatype, op, comm, stream); + } + +private: + static bool HasInvalidArgs(const void *send_buff, void *recv_buff, + DataType datatype, ReductionOpType op, + void *comm_handle) { + if (!comm_handle) { + // TODO(lzm): change to use `glog`. + LOG("Invalid communicator handle for AllReduce."); + return true; + } + if (!send_buff || !recv_buff) { + LOG("Invalid buffer pointer for AllReduce."); + return true; + } + if (op < ReductionOpType::kSum || op >= ReductionOpType::kNumRedOps) { + LOG("Invalid reduction operation for AllReduce."); + return true; + } + if (datatype < DataType::kChar || datatype >= DataType::kNumTypes) { + LOG("Invalid data type for AllReduce."); + return true; + } + return false; + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_ALL_REDUCE_H_ diff --git a/src/base/comm_destroy.h b/src/base/comm_destroy.h new file mode 100644 index 0000000..d4c0d8e --- /dev/null +++ b/src/base/comm_destroy.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_BASE_COMM_DESTROY_H_ +#define INFINI_CCL_BASE_COMM_DESTROY_H_ + +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template +struct CommDestroyImpl; + +class CommDestroy : public Operation { +public: + template + static ReturnStatus Execute(Args &&...args) { + return CommDestroyImpl::Apply( + std::forward(args)...); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_COMM_DESTROY_H_ diff --git a/src/base/comm_init_all.h b/src/base/comm_init_all.h new file mode 100644 index 0000000..8ddb750 --- /dev/null +++ b/src/base/comm_init_all.h @@ -0,0 +1,37 @@ +#ifndef INFINI_CCL_BASE_COMM_INIT_ALL_H_ +#define INFINI_CCL_BASE_COMM_INIT_ALL_H_ + +#include "logging.h" +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template +struct CommInitAllImpl; + +class CommInitAll : public Operation { +public: + template + static ReturnStatus Execute(void **comm_handle, Args &&...args) { + Communicator *&comm = *reinterpret_cast(comm_handle); + if (comm) { + // TODO(lzm): change to use `glog`. + LOG("Invalid communicator handle for CommInitAll."); + return ReturnStatus::kInvalidArgument; + } + + constexpr Device::Type kDev = + ListGetBest(ActiveDevices{}); + + comm = new Communicator(kDev, 0); + + return CommInitAllImpl::Apply( + comm, std::forward(args)...); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_COMM_INIT_ALL_H_ diff --git a/src/base/finalize.h b/src/base/finalize.h new file mode 100644 index 0000000..e2b5e98 --- /dev/null +++ b/src/base/finalize.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_BASE_FINALIZE_H_ +#define INFINI_CCL_BASE_FINALIZE_H_ + +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template +struct FinalizeImpl; + +class Finalize : public Operation { +public: + template + static ReturnStatus Execute(Args &&...args) { + return FinalizeImpl::Apply( + std::forward(args)...); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_FINALIZE_H_ diff --git a/src/base/get_rank.h b/src/base/get_rank.h new file mode 100644 index 0000000..c68fa28 --- /dev/null +++ b/src/base/get_rank.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_BASE_GET_RANK_H_ +#define INFINI_CCL_BASE_GET_RANK_H_ + +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template +struct GetRankImpl; + +class GetRank : public Operation { +public: + template + static ReturnStatus Execute(Args &&...args) { + return GetRankImpl::Apply( + std::forward(args)...); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_GET_RANK_H_ diff --git a/src/base/get_size.h b/src/base/get_size.h new file mode 100644 index 0000000..5a7925c --- /dev/null +++ b/src/base/get_size.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_BASE_GET_SIZE_H_ +#define INFINI_CCL_BASE_GET_SIZE_H_ + +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template +struct GetSizeImpl; + +class GetSize : public Operation { +public: + template + static ReturnStatus Execute(Args &&...args) { + return GetSizeImpl::Apply( + std::forward(args)...); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_GET_SIZE_H_ diff --git a/src/base/init.h b/src/base/init.h new file mode 100644 index 0000000..b0d507b --- /dev/null +++ b/src/base/init.h @@ -0,0 +1,23 @@ +#ifndef INFINI_CCL_BASE_INIT_H_ +#define INFINI_CCL_BASE_INIT_H_ + +#include "operation.h" +#include "return_status_impl.h" + +namespace infini::ccl { + +template struct InitImpl; + +class Init : public Operation { +public: + template + static ReturnStatus Execute(Args &&...args) { + return InitImpl::Apply( + std::forward(args)...); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_BASE_INIT_H_ diff --git a/src/comm_impl.h b/src/comm_impl.h new file mode 100644 index 0000000..b8c4fa3 --- /dev/null +++ b/src/comm_impl.h @@ -0,0 +1,21 @@ +#ifndef INFINI_CCL_COMM_IMPL_H_ +#define INFINI_CCL_COMM_IMPL_H_ + +#include + +#include "comm.h" + +namespace infini::ccl { + +enum class ReductionOpType : int8_t { + kSum = infiniSum, + kProd = infiniProd, + kMax = infiniMax, + kMin = infiniMin, + kAvg = infiniAvg, + kNumRedOps = infiniNumOps, +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_COMM_IMPL_H_ diff --git a/src/communicator.h b/src/communicator.h new file mode 100644 index 0000000..e4ab14d --- /dev/null +++ b/src/communicator.h @@ -0,0 +1,79 @@ +#ifndef INFINI_CCL_COMMUNICATOR_H_ +#define INFINI_CCL_COMMUNICATOR_H_ + +#include + +#include "backend.h" +#include "device.h" + +namespace infini::ccl { + +struct BackendCommInstance { + virtual ~BackendCommInstance() = default; + BackendType type; +}; + +class Communicator { +public: + Communicator(Device::Type device_type, int device_id) + : device_type_(device_type), device_id_(device_id), global_rank_(-1), + global_size_(0) {} + + void set_world_info(int rank, int size) { + global_rank_ = rank; + global_size_ = size; + } + + auto intra_comm() const { return intra_comm_.get(); } + + auto inter_comm() const { return inter_comm_.get(); } + + void set_intra_comm(std::unique_ptr inst) { + intra_comm_ = std::move(inst); + } + + void set_inter_comm(std::unique_ptr inst) { + inter_comm_ = std::move(inst); + } + + BackendType intra_comm_backend() const { + return intra_comm_ ? intra_comm_->type : BackendType::kCount; + } + + BackendType inter_comm_backend() const { + return inter_comm_ ? inter_comm_->type : BackendType::kCount; + } + + int rank() const { return global_rank_; } + + int size() const { return global_size_; } + + int device_id() const { return device_id_; } + + void set_device_id(int id) { device_id_ = id; } + + Device::Type device_type() const { return device_type_; } + + bool HasBackend(BackendType t) const { + return (intra_comm_backend() == t) || (inter_comm_backend() == t); + } + +private: + // Slot 1: Intra-node (e.g., NCCL) + std::unique_ptr intra_comm_; + + // Slot 2: Inter-node (e.g., OpenMPI) + std::unique_ptr inter_comm_; + + int device_id_; + + int global_rank_; + + int global_size_; + + Device::Type device_type_; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_COMMUNICATOR_H_ diff --git a/src/constexpr_map.h b/src/constexpr_map.h new file mode 100644 index 0000000..843ea97 --- /dev/null +++ b/src/constexpr_map.h @@ -0,0 +1,32 @@ +#ifndef INFINI_CCL_CONSTEXPR_MAP_H_ +#define INFINI_CCL_CONSTEXPR_MAP_H_ + +#include +#include +#include +#include + +namespace infini::ccl { + +template struct ConstexprMap { + constexpr ConstexprMap(std::array, size> data) + : data_(data) {} + + constexpr Value at(Key key) const { + for (const auto &pr : data_) { + if (pr.first == key) + return pr.second; + } + // TODO(lzm): change to use `glog`. + assert("the key is not found in the `ConstexprMap`"); + // Unreachable, provided to satisfy the compiler's requirement. + std::abort(); + } + +private: + std::array, size> data_; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_CONSTEXPR_MAP_H_ diff --git a/src/cpu/data_type_.h b/src/cpu/data_type_.h new file mode 100644 index 0000000..8f24ad9 --- /dev/null +++ b/src/cpu/data_type_.h @@ -0,0 +1,99 @@ +#ifndef INFINI_CCL_CPU_DATA_TYPE__H_ +#define INFINI_CCL_CPU_DATA_TYPE__H_ + +#include "cpu/device_.h" +#include "data_type_impl.h" + +namespace infini::ccl { + +struct Float16 { + std::uint16_t bits; + + static inline Float16 FromFloat(float val) { + std::uint32_t f32; + std::memcpy(&f32, &val, sizeof(f32)); + std::uint16_t sign = (f32 >> 16) & 0x8000; + std::int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + std::uint32_t mantissa = f32 & 0x7FFFFF; + + if (exponent >= 16) { + // NaN + if (exponent == 128 && mantissa != 0) { + return {static_cast(sign | 0x7E00)}; + } + // Inf + return {static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return {static_cast(sign | ((exponent + 15) << 10) | + (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return {static_cast(sign | (mantissa >> 13))}; + } + // Too small for subnormal: return signed zero. + return {sign}; + } + + inline float ToFloat() const { + std::uint32_t sign = (bits & 0x8000) << 16; + std::int32_t exponent = (bits >> 10) & 0x1F; + std::uint32_t mantissa = bits & 0x3FF; + std::uint32_t f32_bits; + + if (exponent == 31) { + f32_bits = sign | 0x7F800000 | (mantissa << 13); + } else if (exponent == 0) { + if (mantissa == 0) { + f32_bits = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32_bits = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32_bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + + float result; + std::memcpy(&result, &f32_bits, sizeof(result)); + return result; + } +}; + +struct BFloat16 { + std::uint16_t bits; + + static inline BFloat16 FromFloat(float val) { + std::uint32_t bits32; + std::memcpy(&bits32, &val, sizeof(bits32)); + + const std::uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + std::uint16_t bf16_bits = + static_cast((bits32 + rounding_bias) >> 16); + return {bf16_bits}; + } + + inline float ToFloat() const { + std::uint32_t bits32 = static_cast(bits) << 16; + float result; + std::memcpy(&result, &bits32, sizeof(result)); + return result; + } +}; + +template <> struct TypeMap { + using type = Float16; +}; + +template <> struct TypeMap { + using type = BFloat16; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_CPU_DATA_TYPE__H_ diff --git a/src/cpu/device_.h b/src/cpu/device_.h new file mode 100644 index 0000000..e00578d --- /dev/null +++ b/src/cpu/device_.h @@ -0,0 +1,12 @@ +#ifndef INFINI_CCL_CPU_DEVICE__H_ +#define INFINI_CCL_CPU_DEVICE__H_ + +#include "device.h" + +namespace infini::ccl { + +template <> struct DeviceEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif diff --git a/src/cpu/runtime_.h b/src/cpu/runtime_.h new file mode 100644 index 0000000..95f3643 --- /dev/null +++ b/src/cpu/runtime_.h @@ -0,0 +1,34 @@ +#ifndef INFINI_CCL_CPU_RUNTIME__H_ +#define INFINI_CCL_CPU_RUNTIME__H_ + +#include +#include + +#include "runtime.h" + +namespace infini::ccl { + +template <> +struct Runtime : RuntimeBase> { + static constexpr Device::Type kDeviceType = Device::Type::kCpu; + + static void Malloc(void **ptr, std::size_t size) { *ptr = std::malloc(size); } + + static void Free(void *ptr) { std::free(ptr); } + + static void Memcpy(void *dst, const void *src, std::size_t size, int) { + std::memcpy(dst, src, size); + } + + static constexpr auto Memset = std::memset; + + static constexpr int MemcpyHostToDevice = 0; + + static constexpr int MemcpyDeviceToHost = 1; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::ccl + +#endif // INFINI_CCL_CPU_RUNTIME_H_ diff --git a/src/cuda/runtime_.h b/src/cuda/runtime_.h new file mode 100644 index 0000000..551f807 --- /dev/null +++ b/src/cuda/runtime_.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_CUDA_RUNTIME_H_ +#define INFINI_CCL_CUDA_RUNTIME_H_ + +#include + +#include "runtime.h" + +namespace infini::ccl { + +template struct CudaRuntime : DeviceRuntime { + static constexpr bool Validate() { + DeviceRuntime::Validate(); + static_assert( + std::is_invocable_v, + "`Runtime::Memcpy` must be callable with " + "`(void*, const void*, size_t, MemcpyHostToDevice)`."); + return true; + } +}; + +} // namespace infini::ccl + +#endif diff --git a/src/data_type_impl.h b/src/data_type_impl.h new file mode 100644 index 0000000..333e185 --- /dev/null +++ b/src/data_type_impl.h @@ -0,0 +1,142 @@ +#ifndef INFINI_CCL_DATA_TYPE_IMPL_H_ +#define INFINI_CCL_DATA_TYPE_IMPL_H_ + +#include +#include +#include + +#include "constexpr_map.h" +#include "data_type.h" + +namespace infini::ccl { + +enum class DataType : int8_t { + kChar = infiniChar, + kInt8 = infiniInt8, + kInt16 = infiniInt16, + kInt32 = infiniInt32, + kInt64 = infiniInt64, + kUInt8 = infiniUInt8, + kUInt16 = infiniUInt16, + kUInt32 = infiniUInt32, + kUInt64 = infiniUInt64, + kFloat16 = infiniFloat16, + kBFloat16 = infiniBFloat16, + kFloat32 = infiniFloat32, + kFloat64 = infiniFloat64, + kNumTypes = infiniNumTypes, +}; + +constexpr ConstexprMap kDataTypeToSize{{{ + {DataType::kInt8, 1}, + {DataType::kInt16, 2}, + {DataType::kInt32, 4}, + {DataType::kInt64, 8}, + {DataType::kUInt8, 1}, + {DataType::kUInt16, 2}, + {DataType::kUInt32, 4}, + {DataType::kUInt64, 8}, + {DataType::kFloat16, 2}, + {DataType::kBFloat16, 2}, + {DataType::kFloat32, 4}, + {DataType::kFloat64, 8}, +}}}; + +constexpr ConstexprMap kDataTypeToDesc{{{ + {DataType::kInt8, "int8"}, + {DataType::kInt16, "int16"}, + {DataType::kInt32, "int32"}, + {DataType::kInt64, "int64"}, + {DataType::kUInt8, "uint8"}, + {DataType::kUInt16, "uint16"}, + {DataType::kUInt32, "uint32"}, + {DataType::kUInt64, "uint64"}, + {DataType::kFloat16, "float16"}, + {DataType::kBFloat16, "bfloat16"}, + {DataType::kFloat32, "float32"}, + {DataType::kFloat64, "float64"}, +}}}; + +constexpr ConstexprMap kStringToDataType{{{ + {"int8", DataType::kInt8}, + {"int16", DataType::kInt16}, + {"int32", DataType::kInt32}, + {"int64", DataType::kInt64}, + {"uint8", DataType::kUInt8}, + {"uint16", DataType::kUInt16}, + {"uint32", DataType::kUInt32}, + {"uint64", DataType::kUInt64}, + {"float16", DataType::kFloat16}, + {"bfloat16", DataType::kBFloat16}, + {"float32", DataType::kFloat32}, + {"float64", DataType::kFloat64}, +}}}; + +// `TypeMap` maps a `DataType` on a given `Device::Type` to the corresponding +// C++ type. +template struct TypeMap; + +template +using TypeMapType = typename TypeMap::type; + +// `DataTypeMap` maps a C++ type to the corresponding `DataType`. +template struct DataTypeMap; + +template +inline constexpr DataType DataTypeMapValue = DataTypeMap::value; + +#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \ + template struct TypeMap { \ + using type = CPP_TYPE; \ + }; \ + \ + template <> struct DataTypeMap { \ + static constexpr DataType value = DataType::ENUM_VALUE; \ + }; + +DEFINE_DATA_TYPE_MAPPING(kUInt8, std::uint8_t) +DEFINE_DATA_TYPE_MAPPING(kInt8, std::int8_t) +DEFINE_DATA_TYPE_MAPPING(kUInt16, std::uint16_t) +DEFINE_DATA_TYPE_MAPPING(kInt16, std::int16_t) +DEFINE_DATA_TYPE_MAPPING(kUInt32, std::uint32_t) +DEFINE_DATA_TYPE_MAPPING(kInt32, std::int32_t) +DEFINE_DATA_TYPE_MAPPING(kUInt64, std::uint64_t) +DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t) +DEFINE_DATA_TYPE_MAPPING(kFloat32, float) +DEFINE_DATA_TYPE_MAPPING(kFloat64, double) +#undef DEFINE_DATA_TYPE_MAPPING + +// Checks whether a C++ type is the bfloat16 or float16 type for the given +// device. Full specializations for each device's float16/bfloat16 types are +// provided in the corresponding platform-specific device type headers. +template +inline constexpr bool IsBFloat16 = + std::is_same_v>; + +template +inline constexpr bool IsFP16 = + std::is_same_v>; + +// Defines the common categories of data types using List. +using FloatTypes = List; +using ReducedFloatTypes = List; +using IntTypes = + List; +using UIntTypes = List; + +using BitTypes8 = List; +using BitTypes16 = List; +using BitTypes32 = + List; +using BitTypes64 = + List; + +using AllFloatTypes = ConcatType; +using AllIntTypes = ConcatType; +using AllTypes = ConcatType; + +} // namespace infini::ccl + +#endif // INFINI_CCL_DATA_TYPE_IMPL_H_ diff --git a/src/device.h b/src/device.h new file mode 100644 index 0000000..ec47794 --- /dev/null +++ b/src/device.h @@ -0,0 +1,138 @@ +#ifndef INFINI_CCL_DEVICE_H_ +#define INFINI_CCL_DEVICE_H_ + +#include + +#include "constexpr_map.h" +#include "traits.h" + +namespace infini::ccl { + +class Device { +public: + enum class Type { + kCpu = 0, + kNvidia = 1, + kCambricon = 2, + kAscend = 3, + kMetax = 4, + kMoore = 5, + kIluvatar = 6, + kKunlun = 7, + kHygon = 8, + kQy = 9, + kCount + }; + + Device() = default; + + Device(const Type &type, const int &index = 0) : type_{type}, index_{index} {} + + static const Type TypeFromString(const std::string &name) { + return kDescToDevice.at(name); + } + + static const std::string_view StringFromType(const Type &type) { + return kDeviceToDesc.at(type); + } + + const Type &type() const { return type_; } + + const int &index() const { return index_; } + + std::string ToString() const { + return std::string{StringFromType(type_)} + ":" + std::to_string(index_); + } + + bool operator==(const Device &other) const { + return type_ == other.type_ && index_ == other.index_; + } + + bool operator!=(const Device &other) const { return !(*this == other); } + +private: + Type type_{Type::kCpu}; + + static constexpr ConstexprMap(Device::Type::kCount)> + kDeviceToDesc{{{ + {Type::kCpu, "cpu"}, + {Type::kNvidia, "nvidia"}, + {Type::kCambricon, "cambricon"}, + {Type::kAscend, "ascend"}, + {Type::kMetax, "metax"}, + {Type::kMoore, "moore"}, + {Type::kIluvatar, "iluvatar"}, + {Type::kKunlun, "kunlun"}, + {Type::kHygon, "hygon"}, + {Type::kQy, "qy"}, + }}}; + + static constexpr ConstexprMap(Device::Type::kCount)> + kDescToDevice{{{ + {"cpu", Type::kCpu}, + {"nvidia", Type::kNvidia}, + {"cambricon", Type::kCambricon}, + {"ascend", Type::kAscend}, + {"metax", Type::kMetax}, + {"moore", Type::kMoore}, + {"iluvatar", Type::kIluvatar}, + {"kunlun", Type::kKunlun}, + {"hygon", Type::kHygon}, + {"qy", Type::kQy}, + }}}; + + int index_{0}; +}; + +// Primary template: Devices are disabled by default. Platform-specific +// headers (e.g. `cpu/device_.h`) specialize this to `std::true_type`. +template struct DeviceEnabled : std::false_type {}; + +// Defines the common categories of devices using List. +using AllDeviceTypes = + List; + +// Deferred computation of active devices. The `Filter` and `FilterList` +// evaluation are nested inside a class template so that `DeviceEnabled` +// specializations from platform `device_.h` headers are visible at +// instantiation time. Use with a dependent type parameter +// (e.g. `ActiveDevices`) to ensure deferred instantiation. +template struct ActiveDevicesImpl { + struct Filter { + template + std::enable_if_t::value> + operator()(ValueTag) const {} + }; + + using type = typename FilterList, AllDeviceTypes>::type; +}; + +template using ActiveDevices = typename ActiveDevicesImpl::type; + +/** + * @brief Priority trait for device selection. + */ +template struct DevicePriority { + static constexpr int value = 0; +}; + +template <> struct DevicePriority { + static constexpr int value = 1; +}; + +template <> struct DevicePriority { + static constexpr int value = 5; +}; + +template <> struct DevicePriority { + static constexpr int value = 5; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_DEVICE_H_ diff --git a/src/dispatcher.h b/src/dispatcher.h new file mode 100644 index 0000000..15b7c59 --- /dev/null +++ b/src/dispatcher.h @@ -0,0 +1,339 @@ +#ifndef INFINI_CCL_DISPATCHER_H_ +#define INFINI_CCL_DISPATCHER_H_ + +#include +#include +#include +#include + +#include "data_type_impl.h" +#include "device.h" +#include "traits.h" + +namespace infini::ccl { + +// ----------------------------------------------------------------------------- +// Core Generic Runtime Dispatchers +// ----------------------------------------------------------------------------- + +namespace detail { + +// Implements the dispatch body over a resolved `List`. +template +auto DispatchFuncImpl(ValueType value, Functor &&func, + std::string_view context_str, List, + Args &&...args) { + using ReturnType = decltype(std::forward(func)( + ValueTag(head)>{}, std::forward(args)...)); + + // Path for void functions. + if constexpr (std::is_void_v) { + bool handled = ((value == static_cast(tail) + ? (std::forward(func)( + ValueTag{}, std::forward(args)...), + true) + : false) || + ... || + (value == static_cast(head) + ? (std::forward(func)( + ValueTag{}, std::forward(args)...), + true) + : false)); + + if (!handled) { + // TODO(lzm): change to use `glog`. + std::cerr << "dispatch error (void): value " << static_cast(value) + << " not supported in the context: " << context_str << "\n"; + std::abort(); + } + } + // Path for non-void functions. + else { + std::optional result; + bool handled = ((value == static_cast(tail) + ? (result.emplace(std::forward(func)( + ValueTag{}, std::forward(args)...)), + true) + : false) || + ... || + (value == static_cast(head) + ? (result.emplace(std::forward(func)( + ValueTag{}, std::forward(args)...)), + true) + : false)); + + if (handled) { + return *result; + } + // TODO(lzm): change to use `glog`. + std::cerr << "dispatch error (non-void): value " << static_cast(value) + << " not supported in the context: " << context_str << "\n"; + std::abort(); + return ReturnType{}; + } +} + +// Deduces `head`/`tail` from a `List` type via partial specialization, +// then forwards to `DispatchFuncImpl`. +template +struct DispatchFuncUnwrap; + +template +struct DispatchFuncUnwrap, + std::tuple> { + static auto call(ValueType value, Functor &&func, + std::string_view context_str, Args &&...args) { + return DispatchFuncImpl(value, std::forward(func), context_str, + List{}, std::forward(args)...); + } +}; + +// Empty-list specialization +template +struct DispatchFuncUnwrap, std::tuple> { + static auto call(ValueType value, Functor &&, std::string_view context_str, + Args &&...) { + // TODO(lzm): change to use `glog`. + std::cerr << "dispatch error: no allowed values registered for value " + << static_cast(value) + << " in the context: " << context_str << "\n"; + std::abort(); + } +}; + +} // namespace detail + +// (Single Dispatch) Dispatches a runtime value to a compile-time functor. +template +auto DispatchFunc(ValueType value, Functor &&func, + std::string_view context_str = "", Args &&...args) { + using FilteredPack = typename Filter, List<>, + all_values...>::type; + + return detail::DispatchFuncUnwrap< + ValueType, Functor, FilteredPack, + std::tuple>::call(value, std::forward(func), + context_str, std::forward(args)...); +} + +// (Multi-Dispatch) Dispatches a vector of runtime values to a compile-time +// functor. +// Base Case: All Dimensions Resolved +template +auto DispatchFunc(const std::vector &values, size_t /*index*/, + Functor &&func, std::string_view /*context_str*/, + List, Args &&...args) { + return std::forward(func)(List{}, + std::forward(args)...); +} + +// Forward declaration of the recursive multi-dispatch overload. +template +auto DispatchFunc(const std::vector &values, size_t index, + Functor &&func, std::string_view context_str, List, + Args &&...args); + +// Adapter used in the recursive multi-dispatch case: given a resolved value +// `val` recurse into the next dimension. +template +struct MultiDispatchRecurseAdapter; + +template +struct MultiDispatchRecurseAdapter, Functor, items...> { + const std::vector &values; + size_t next_index; + Functor &func; + std::string_view context_str; + + template + auto operator()(ValueTag, Args &&...args) const { + return DispatchFunc(values, next_index, func, context_str, + List{}, + std::forward(args)...); + } +}; + +template +auto MultiDispatchFirstDim(const std::vector &values, size_t index, + Functor &func, std::string_view context_str, + List, List, Args &&...args) { + static_assert(sizeof...(allowed) > 0, + "`DispatchFunc` dimension list is empty"); + using EnumType = std::common_type_t; + + MultiDispatchRecurseAdapter adapter{ + values, index + 1, func, context_str}; + + return DispatchFunc( + static_cast(values.at(index)), adapter, context_str, + std::forward(args)...); +} + +// (Multi-Dispatch) Recursive Case +template +auto DispatchFunc(const std::vector &values, size_t index, + Functor &&func, std::string_view context_str, List, + Args &&...args) { + return MultiDispatchFirstDim>( + values, index, func, context_str, List{}, FirstList{}, + std::forward(args)...); +} + +// ----------------------------------------------------------------------------- +// High-Level Specialized Dispatchers +// ----------------------------------------------------------------------------- +// These provide cleaner and more convenient APIs for common InfiniOps types. + +namespace detail { + +// Bridges the generic value dispatch layer to the `DataType`-specific type +// dispatch layer. +template struct DataTypeAdapter { + Functor &func; + + template + auto operator()(ValueTag, Args &&...args) const { + using T = TypeMapType(dtype)>; + return func(TypeTag{}, std::forward(args)...); + } +}; + +template struct DataTypeMultiAdapter { + Functor &func; + + template + auto operator()(List, Args &&...args) const { + return func(TypeTag(dtypes)>>{}..., + std::forward(args)...); + } +}; + +template struct DeviceAdapter { + Functor &func; + + template + auto operator()(ValueTag, Args &&...args) const { + return func(ValueTag{}, std::forward(args)...); + } +}; + +template struct DeviceMultiAdapter { + Functor &func; + + template + auto operator()(List, Args &&...args) const { + return func(ValueTag{}..., std::forward(args)...); + } +}; + +} // namespace detail + +// `DataType` Dispatch +template +auto DispatchFunc(DataType dtype, Functor &&func, + std::string_view context_str = "", Args &&...args) { + detail::DataTypeAdapter> adapter{func}; + return DispatchFunc(dtype, adapter, context_str, + std::forward(args)...); +} + +// `DataType` Multi-Dispatch +template +auto DispatchFunc(std::initializer_list dtypes, Functor &&func, + std::string_view context_str = "", Args &&...args) { + std::vector v; + for (auto d : dtypes) + v.push_back(static_cast(d)); + + detail::DataTypeMultiAdapter> adapter{ + func}; + return DispatchFunc(v, 0, adapter, context_str, List<>{}, + std::forward(args)...); +} + +// `Device` Dispatch +template +auto DispatchFunc(Device::Type device, Functor &&func, + std::string_view context_str = "", Args &&...args) { + detail::DeviceAdapter> adapter{func}; + return DispatchFunc(allowed_devices)...>( + device, adapter, context_str, std::forward(args)...); +} + +// `Device` Multi-Dispatch +template +auto DispatchFunc(std::initializer_list devices, Functor &&func, + std::string_view context_str = "", Args &&...args) { + std::vector v; + for (auto d : devices) + v.push_back(static_cast(d)); + + detail::DeviceMultiAdapter> adapter{func}; + return DispatchFunc(v, 0, adapter, context_str, List<>{}, + std::forward(args)...); +} + +template +auto DispatchFuncListAliasImpl(ValueType value, Functor &&func, + std::string_view context_str, List, + Args &&...args) { + return DispatchFunc>(items)...>( + value, std::forward(func), context_str, + std::forward(args)...); +} + +template +auto DispatchFuncListAliasImpl(ValueType value, Functor &&func, + std::string_view context_str, List, + Args &&...args) { + return DispatchFunc>(items)...>( + value, std::forward(func), context_str, + std::forward(args)...); +} + +// Interface for Generic `List` Aliases (for non-DataType dispatch, e.g. Device) +template ::value>> +auto DispatchFunc(ValueType value, Functor &&func, + std::string_view context_str = "", Args &&...args) { + return DispatchFuncListAliasImpl(value, std::forward(func), + context_str, ListType{}, + std::forward(args)...); +} + +// Interface for Generic `List` Aliases (for DataType dispatch with device type) +template ::value>> +auto DispatchFunc(ValueType value, Functor &&func, + std::string_view context_str = "", Args &&...args) { + return DispatchFuncListAliasImpl(value, std::forward(func), + context_str, ListType{}, + std::forward(args)...); +} + +// Interface for Any `int64_t`-Convertible Types +template +auto DispatchFunc(std::initializer_list keys, Functor &&func, + std::string_view context_str = "", Args &&...args) { + std::vector v_keys(keys); + return DispatchFunc(v_keys, 0, std::forward(func), + context_str, List<>{}, + std::forward(args)...); +} + +} // namespace infini::ccl + +#endif // INFINI_CCL_DISPATCHER_H_ diff --git a/src/logging.h b/src/logging.h new file mode 100644 index 0000000..ed34837 --- /dev/null +++ b/src/logging.h @@ -0,0 +1,48 @@ +#ifndef INFINI_CCL_UTILS_H_ +#define INFINI_CCL_UTILS_H_ + +#include "constexpr_map.h" +#include + +// Internal helper macros to count arguments +#define GET_LOG_MACRO(_1, _2, NAME, ...) NAME + +#define LOG_1(msg) \ + ::infini::ccl::Logger::PrintMsg( \ + (msg), ::infini::ccl::Logger::LogLevel::kError, __FILE__, __LINE__) + +#define LOG_2(msg, level) \ + ::infini::ccl::Logger::PrintMsg((msg), (level), __FILE__, __LINE__) + +// Temporary logging macro. To be replaced by `glog` in the future. +#define LOG(...) GET_LOG_MACRO(__VA_ARGS__, LOG_2, LOG_1)(__VA_ARGS__) + +namespace infini::ccl { + +class Logger { +public: + enum class LogLevel : int8_t { kInfo, kWarning, kError, kFatal, kCount }; + + static void PrintMsg(const char *msg, LogLevel level = LogLevel::kError, + const char *file = __FILE__, int line = __LINE__) { + std::cerr << "[InfiniCCL " << kLogLevelToDesc.at(level) << "] " << file + << ":" << line << " - " << msg << std::endl; + if (level == LogLevel::kFatal) { + std::abort(); + } + } + +private: + static constexpr ConstexprMap(LogLevel::kCount)> + kLogLevelToDesc{{{ + {LogLevel::kInfo, "INFO"}, + {LogLevel::kWarning, "WARNING"}, + {LogLevel::kError, "ERROR"}, + {LogLevel::kFatal, "FATAL"}, + }}}; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_UTILS_H_ diff --git a/src/metax/data_type_.h b/src/metax/data_type_.h new file mode 100644 index 0000000..7e824f7 --- /dev/null +++ b/src/metax/data_type_.h @@ -0,0 +1,25 @@ +#ifndef INFINI_CCL_METAX_DATA_TYPE__H_ +#define INFINI_CCL_METAX_DATA_TYPE__H_ + +// clang-format off +#include +#include +#include +// clang-format on + +#include "data_type_impl.h" +#include "metax/device_.h" + +namespace infini::ccl { + +template <> struct TypeMap { + using type = __half; +}; + +template <> struct TypeMap { + using type = __maca_bfloat16; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_METAX_DATA_TYPE__H_ diff --git a/src/metax/device_.h b/src/metax/device_.h new file mode 100644 index 0000000..8660f0e --- /dev/null +++ b/src/metax/device_.h @@ -0,0 +1,12 @@ +#ifndef INFINI_CCL_METAX_DEVICE__H_ +#define INFINI_CCL_METAX_DEVICE__H_ + +#include "device.h" + +namespace infini::ccl { + +template <> struct DeviceEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_METAX_DEVICE__H_ diff --git a/src/metax/runtime_.h b/src/metax/runtime_.h new file mode 100644 index 0000000..393d73b --- /dev/null +++ b/src/metax/runtime_.h @@ -0,0 +1,37 @@ +#ifndef INFINI_CCL_METAX_RUNTIME__H_ +#define INFINI_CCL_METAX_RUNTIME__H_ + +// clang-format off +#include +// clang-format on + +#include "cuda/runtime_.h" +#include "metax/device_.h" + +namespace infini::ccl { + +template <> +struct Runtime + : CudaRuntime> { + using Stream = mcStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMetax; + + static constexpr auto Malloc = mcMalloc; + + static constexpr auto Memcpy = mcMemcpy; + + static constexpr auto Free = mcFree; + + static constexpr auto MemcpyHostToDevice = mcMemcpyHostToDevice; + + static constexpr auto MemcpyDeviceToHost = mcMemcpyDeviceToHost; + + static constexpr auto Memset = mcMemset; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::ccl + +#endif // INFINI_CCL_METAX_RUNTIME__H_ diff --git a/src/nvidia/data_type_.h b/src/nvidia/data_type_.h new file mode 100644 index 0000000..6d686f5 --- /dev/null +++ b/src/nvidia/data_type_.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_NVIDIA_DATA_TYPE__H_ +#define INFINI_CCL_NVIDIA_DATA_TYPE__H_ + +// clang-format off +#include +#include +// clang-format on + +#include "data_type_impl.h" +#include "nvidia/device_.h" + +namespace infini::ccl { + +template <> struct TypeMap { + using type = half; +}; + +template <> struct TypeMap { + using type = __nv_bfloat16; +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_NVIDIA_DATA_TYPE__H_ diff --git a/src/nvidia/device_.h b/src/nvidia/device_.h new file mode 100644 index 0000000..e4b5b29 --- /dev/null +++ b/src/nvidia/device_.h @@ -0,0 +1,12 @@ +#ifndef INFINI_CCL_NVIDIA_DEVICE__H_ +#define INFINI_CCL_NVIDIA_DEVICE__H_ + +#include "device.h" + +namespace infini::ccl { + +template <> struct DeviceEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif diff --git a/src/nvidia/nccl/comm_instance.h b/src/nvidia/nccl/comm_instance.h new file mode 100644 index 0000000..12fe2fc --- /dev/null +++ b/src/nvidia/nccl/comm_instance.h @@ -0,0 +1,17 @@ +#ifndef INFINI_CCL_NVIDIA_NCCL_COMM_INSTANCE_H_ +#define INFINI_CCL_NVIDIA_NCCL_COMM_INSTANCE_H_ + +#include + +#include "communicator.h" + +namespace infini::ccl { + +struct NcclInstance : public BackendCommInstance { + ncclComm_t handle; + NcclInstance() { type = BackendType::kNccl; } +}; + +} // namespace infini::ccl + +#endif diff --git a/src/nvidia/runtime_.h b/src/nvidia/runtime_.h new file mode 100644 index 0000000..452891c --- /dev/null +++ b/src/nvidia/runtime_.h @@ -0,0 +1,47 @@ +#ifndef INFINI_CCL_NVIDIA_RUNTIME_H_ +#define INFINI_CCL_NVIDIA_RUNTIME_H_ + +#include + +// clang-format off +#include +// clang-format on + +#include "cuda/runtime_.h" +#include "nvidia/device_.h" + +namespace infini::ccl { + +template <> +struct Runtime + : CudaRuntime> { + using Stream = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kNvidia; + + static constexpr auto Malloc = [](auto &&...args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto Memcpy = cudaMemcpy; + + static constexpr auto Free = cudaFree; + + static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; + + static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; + + static constexpr auto Memset = cudaMemset; + + static constexpr auto SetDevice = cudaSetDevice; + + static constexpr auto DeviceSynchronize = cudaDeviceSynchronize; + + static constexpr auto StreamSynchronize = cudaStreamSynchronize; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::ccl + +#endif diff --git a/src/ompi/checks.h b/src/ompi/checks.h new file mode 100644 index 0000000..0001f49 --- /dev/null +++ b/src/ompi/checks.h @@ -0,0 +1,29 @@ +#ifndef INFINI_CCL_OMPI_CHECKS_H_ +#define INFINI_CCL_OMPI_CHECKS_H_ + +#include +#include + +#include "return_status_impl.h" + +#define INFINI_CHECK_MPI(result) \ + ::infini::ccl::detail::CheckMpiImpl((result), __FILE__, __LINE__) + +namespace infini::ccl { + +namespace detail { + +inline ReturnStatus CheckMpiImpl(int mpi_result, const char *file, int line) { + if (mpi_result != MPI_SUCCESS) { + std::cerr << "backend(ompi) MPI error code: " << mpi_result << " at line " + << line << " in " << file << std::endl; + std::abort(); + } + return ReturnStatus::kSuccess; +} + +} // namespace detail + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_CHECKS_H_ diff --git a/src/ompi/comm_instance.h b/src/ompi/comm_instance.h new file mode 100644 index 0000000..a37c24d --- /dev/null +++ b/src/ompi/comm_instance.h @@ -0,0 +1,25 @@ +#ifndef INFINI_CCL_OMPI_COMM_INSTANCE_H_ +#define INFINI_CCL_OMPI_COMM_INSTANCE_H_ + +#include + +#include "communicator.h" + +namespace infini::ccl { + +struct OmpiInstance : public BackendCommInstance { + MPI_Comm handle = MPI_COMM_NULL; + + OmpiInstance() { type = BackendType::kOmpi; } + + void Destroy() { + // Ensure we don't accidentally leak if a backend duplicates a communicator. + if (handle != MPI_COMM_WORLD && handle != MPI_COMM_NULL) { + MPI_Comm_free(&handle); + } + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_COMM_INSTANCE_H_ diff --git a/src/ompi/impl/all_reduce.h b/src/ompi/impl/all_reduce.h new file mode 100644 index 0000000..dc87f0c --- /dev/null +++ b/src/ompi/impl/all_reduce.h @@ -0,0 +1,87 @@ +#ifndef INFINI_CCL_OMPI_IMPL_ALL_REDUCE_H_ +#define INFINI_CCL_OMPI_IMPL_ALL_REDUCE_H_ + +#include "base/all_reduce.h" +#include "communicator.h" +#include "dispatcher.h" +#include "logging.h" +#include "ompi/checks.h" +#include "ompi/comm_instance.h" +#include "ompi/type_map.h" + +namespace infini::ccl { + +template +class AllReduceImpl { +public: + static ReturnStatus Apply(const void *send_buff, void *recv_buff, + size_t count, DataType data_type, + ReductionOpType op, Communicator *comm, + void *stream) { + constexpr Device::Type kDev = + ListGetBest(ActiveDevices{}); + + auto *inst = static_cast(comm->inter_comm()); + + if (!inst || inst->handle == MPI_COMM_NULL) { + LOG("Invalid OpenMPI communicator instance for AllReduce."); + return ReturnStatus::kInternalError; + } + + MPI_Datatype mpi_type = DataTypeToOmpiType(data_type); + MPI_Op mpi_op = RedOpToOmpiOp(op); + size_t type_size = kDataTypeToSize.at(data_type); + size_t total_bytes = count * type_size; + + // Handle GPU Memory (Staging Pattern) + // Note: we simply use host-staging for now. + void *host_sendbuf = malloc(total_bytes); + void *host_recvbuf = malloc(total_bytes); + if (!host_sendbuf || !host_recvbuf) { + free(host_sendbuf); + free(host_recvbuf); + LOG("Failed to allocate host buffers for AllReduce staging."); + return ReturnStatus::kSystemError; + } + + Runtime::Memcpy(host_sendbuf, send_buff, total_bytes, + Runtime::MemcpyDeviceToHost); + + Runtime::StreamSynchronize( + static_cast::Stream>(stream)); + + INFINI_CHECK_MPI(MPI_Allreduce(host_sendbuf, host_recvbuf, count, mpi_type, + mpi_op, inst->handle)); + + if (op == ReductionOpType::kAvg) { + int size = comm->size(); + float scale = 1.0f / static_cast(size); + + DispatchFunc(data_type, [&](auto dtype) { + using T = typename decltype(dtype)::type; + + T *typed_buf = static_cast(host_recvbuf); + + // Simply do the averaging on the CPU before the H2D copy. + for (size_t i = 0; i < count; ++i) { + typed_buf[i] *= static_cast(scale); + } + }); + } + + Runtime::Memcpy(recv_buff, host_recvbuf, total_bytes, + Runtime::MemcpyHostToDevice); + + free(host_sendbuf); + free(host_recvbuf); + + return ReturnStatus::kSuccess; + } +}; + +template <> +struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_ALL_REDUCE_H_ diff --git a/src/ompi/impl/comm_destroy.h b/src/ompi/impl/comm_destroy.h new file mode 100644 index 0000000..03a2464 --- /dev/null +++ b/src/ompi/impl/comm_destroy.h @@ -0,0 +1,35 @@ +#ifndef INFINI_CCL_OMPI_IMPL_COMM_DESTROY_H_ +#define INFINI_CCL_OMPI_IMPL_COMM_DESTROY_H_ + +#include "base/comm_destroy.h" +#include "communicator.h" +#include "ompi/checks.h" +#include "ompi/comm_instance.h" + +namespace infini::ccl { + +template +class CommDestroyImpl { +public: + static ReturnStatus Apply(void *comm) { + auto *comm_internal = static_cast(comm); + if (!comm_internal) { + return ReturnStatus::kInternalError; + } + + if (auto *inter = + static_cast(comm_internal->inter_comm())) { + inter->Destroy(); + } + comm_internal->set_inter_comm(nullptr); + + return ReturnStatus::kSuccess; + } +}; + +template <> +struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_COMM_DESTROY_H_ diff --git a/src/ompi/impl/comm_init_all.h b/src/ompi/impl/comm_init_all.h new file mode 100644 index 0000000..36fb093 --- /dev/null +++ b/src/ompi/impl/comm_init_all.h @@ -0,0 +1,60 @@ +#ifndef INFINI_CCL_OMPI_IMPL_COMM_INIT_ALL_H_ +#define INFINI_CCL_OMPI_IMPL_COMM_INIT_ALL_H_ + +#include "base/comm_init_all.h" +#include "communicator.h" +#include "logging.h" +#include "ompi/checks.h" +#include "ompi/comm_instance.h" + +namespace infini::ccl { + +template +class CommInitAllImpl { +public: + static ReturnStatus Apply(Communicator *comm, int n_dev, + const int *dev_list) { + constexpr Device::Type kDev = + ListGetBest(ActiveDevices{}); + + if (!comm) { + // TODO(lzm): change to use `glog`. + LOG("Failed to initialize OpenMPI communicator: invalid " + "communicator pointer."); + return ReturnStatus::kInternalError; + } + + int rank, size; + auto inst = std::make_unique(); + INFINI_CHECK_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &inst->handle)); + INFINI_CHECK_MPI(MPI_Comm_rank(inst->handle, &rank)); + INFINI_CHECK_MPI(MPI_Comm_size(inst->handle, &size)); + + comm->set_world_info(rank, size); + comm->set_inter_comm(std::move(inst)); + + int local_rank = 0; + char *local_rank_str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"); + if (local_rank_str) { + local_rank = atoi(local_rank_str); + } + + // Use device from devlist or local_rank + if (dev_list && local_rank < n_dev) { + comm->set_device_id(dev_list[local_rank]); + } else { + comm->set_device_id(local_rank); + } + + Runtime::SetDevice(comm->device_id()); + + return ReturnStatus::kSuccess; + } +}; + +template <> +struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_COMM_INIT_ALL_H_ diff --git a/src/ompi/impl/finalize.h b/src/ompi/impl/finalize.h new file mode 100644 index 0000000..2787c52 --- /dev/null +++ b/src/ompi/impl/finalize.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_OMPI_IMPL_FINALIZE_H_ +#define INFINI_CCL_OMPI_IMPL_FINALIZE_H_ + +#include "base/finalize.h" +#include "ompi/checks.h" + +namespace infini::ccl { + +template +class FinalizeImpl { +public: + static ReturnStatus Apply() { + int finalized; + INFINI_CHECK_MPI(MPI_Finalized(&finalized)); + return ReturnStatus::kSuccess; + } +}; + +template <> +struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_FINALIZE_H_ diff --git a/src/ompi/impl/get_rank.h b/src/ompi/impl/get_rank.h new file mode 100644 index 0000000..4664d8d --- /dev/null +++ b/src/ompi/impl/get_rank.h @@ -0,0 +1,23 @@ +#ifndef INFINI_CCL_OMPI_IMPL_GET_RANK_H_ +#define INFINI_CCL_OMPI_IMPL_GET_RANK_H_ + +#include "base/get_rank.h" +#include "ompi/checks.h" + +namespace infini::ccl { + +template +class GetRankImpl { +public: + static ReturnStatus Apply(int *rank) { + INFINI_CHECK_MPI(MPI_Comm_rank(MPI_COMM_WORLD, rank)); + return ReturnStatus::kSuccess; + } +}; + +template <> +struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_GET_RANK_H_ diff --git a/src/ompi/impl/get_size.h b/src/ompi/impl/get_size.h new file mode 100644 index 0000000..dac8492 --- /dev/null +++ b/src/ompi/impl/get_size.h @@ -0,0 +1,23 @@ +#ifndef INFINI_CCL_OMPI_IMPL_GET_SIZE_H_ +#define INFINI_CCL_OMPI_IMPL_GET_SIZE_H_ + +#include "base/get_size.h" +#include "ompi/checks.h" + +namespace infini::ccl { + +template +class GetSizeImpl { +public: + static ReturnStatus Apply(int *size) { + INFINI_CHECK_MPI(MPI_Comm_size(MPI_COMM_WORLD, size)); + return ReturnStatus::kSuccess; + } +}; + +template <> +struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_GET_SIZE_H_ diff --git a/src/ompi/impl/init.h b/src/ompi/impl/init.h new file mode 100644 index 0000000..9c47fbb --- /dev/null +++ b/src/ompi/impl/init.h @@ -0,0 +1,40 @@ +#ifndef INFINI_CCL_OMPI_IMPL_INIT_H_ +#define INFINI_CCL_OMPI_IMPL_INIT_H_ + +#include "base/init.h" +#include "ompi/checks.h" + +namespace infini::ccl { + +template +class InitImpl { +public: + static ReturnStatus Apply(int *argc, char ***argv) { + int initialized = 0; + INFINI_CHECK_MPI(MPI_Initialized(&initialized)); + + if (!initialized) { + int provided; + // Use `MPI_Init_thread` to support multi-threaded GPU streams. + INFINI_CHECK_MPI(MPI_Init_thread(argc, argv, required_level_, &provided)); + + if (provided < required_level_) { + // TODO(lzm): change to use `glog`. + std::cerr + << "[InfiniCCL Warning] MPI implementation does not fully support " + << "`MPI_THREAD_SERIALIZED` (provided: " << provided << "). " + << "Concurrent collective operations may be unsafe." << std::endl; + } + } + return ReturnStatus::kSuccess; + } + +private: + constexpr static auto required_level_ = MPI_THREAD_FUNNELED; +}; + +template <> struct BackendEnabled : std::true_type {}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_IMPL_INIT_H_ diff --git a/src/ompi/type_map.h b/src/ompi/type_map.h new file mode 100644 index 0000000..b6872c3 --- /dev/null +++ b/src/ompi/type_map.h @@ -0,0 +1,46 @@ +#ifndef INFINI_CCL_OMPI_TYPE_MAPPING_H_ +#define INFINI_CCL_OMPI_TYPE_MAPPING_H_ + +#include + +#include "comm_impl.h" +#include "data_type_impl.h" + +namespace infini::ccl { + +static const ConstexprMap kOmpiTypeMap{{{ + {DataType::kInt8, MPI_INT8_T}, + {DataType::kInt16, MPI_INT16_T}, + {DataType::kInt32, MPI_INT32_T}, + {DataType::kInt64, MPI_INT64_T}, + {DataType::kUInt8, MPI_UINT8_T}, + {DataType::kUInt16, MPI_UINT16_T}, + {DataType::kUInt32, MPI_UINT32_T}, + {DataType::kUInt64, MPI_UINT64_T}, + {DataType::kFloat32, MPI_FLOAT}, + {DataType::kFloat64, MPI_DOUBLE}, + {DataType::kFloat16, MPI_BYTE}, + {DataType::kBFloat16, MPI_BYTE}, +}}}; + +static const ConstexprMap kOmpiOpMap{{{ + {ReductionOpType::kSum, MPI_SUM}, + {ReductionOpType::kProd, MPI_PROD}, + {ReductionOpType::kMax, MPI_MAX}, + {ReductionOpType::kMin, MPI_MIN}, + // OpenMPI does not support native average reduction, so we map it to sum + // and handle the scaling manually. + {ReductionOpType::kAvg, MPI_SUM}, +}}}; + +inline MPI_Datatype DataTypeToOmpiType(DataType dtype) { + return kOmpiTypeMap.at(dtype); +} + +inline MPI_Op RedOpToOmpiOp(ReductionOpType red_op) { + return kOmpiOpMap.at(red_op); +} + +} // namespace infini::ccl + +#endif // INFINI_CCL_OMPI_TYPE_MAPPING_H_ diff --git a/src/operation.h b/src/operation.h new file mode 100644 index 0000000..c7dc630 --- /dev/null +++ b/src/operation.h @@ -0,0 +1,45 @@ +#ifndef INFINI_CCL_OPERATION_H_ +#define INFINI_CCL_OPERATION_H_ + +#include + +#include "backend.h" +#include "device.h" +#include "dispatcher.h" +#include "traits.h" + +namespace infini::ccl { + +template +class Operation { +public: + template static auto Call(Args &&...args) { + constexpr BackendType kBestBack = + ListGetBest(ActiveBackends{}); + constexpr Device::Type kBestDev = + ListGetBest(ActiveDevices{}); + + return Call(kBestBack, kBestDev, std::forward(args)...); + } + + template + static auto Call(BackendType backend, Device::Type device, Args &&...args) { + return DispatchFunc, ActiveDevices>( + {static_cast(backend), static_cast(device)}, + [&](auto resolved_list) { + constexpr BackendType kBackend = + static_cast(ListGet<0>(resolved_list)); + constexpr Device::Type kDevice = + static_cast(ListGet<1>(resolved_list)); + + return Key::template Execute( + std::forward(args)...); + }, + "Operation::Call"); + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_OPERATION_H_ diff --git a/src/return_status_impl.h b/src/return_status_impl.h new file mode 100644 index 0000000..30db5e7 --- /dev/null +++ b/src/return_status_impl.h @@ -0,0 +1,24 @@ +#ifndef INFINI_CCL_RETURN_STATUS_IMPL_H_ +#define INFINI_CCL_RETURN_STATUS_IMPL_H_ + +#include + +#include "return_status.h" + +namespace infini::ccl { + +enum class ReturnStatus : int8_t { + kSuccess = infiniSuccess, + kUnhandledError = infiniUnhandledError, + kSystemError = infiniSystemError, + kInternalError = infiniInternalError, + kInvalidArgument = infiniInvalidArgument, + kInvalidUsage = infiniInvalidUsage, + kRemoteError = infiniRemoteError, + kInProgress = infiniInProgress, + kNumResults = infiniNumResults, +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_RETURN_STATUS_IMPL_H_ diff --git a/src/runtime.h b/src/runtime.h new file mode 100644 index 0000000..1e603c6 --- /dev/null +++ b/src/runtime.h @@ -0,0 +1,52 @@ +#ifndef INFINI_CCL_RUNTIME_H_ +#define INFINI_CCL_RUNTIME_H_ + +#include + +#include "device.h" + +namespace infini::ccl { + +template struct Runtime; + +/// ## Interface enforcement via CRTP. +/// +/// Inherit from the appropriate base to declare which interface level a +/// `Runtime` specialization implements. After the struct is fully defined, call +/// `static_assert(Runtime<...>::Validate())`. The chained `Validate()` checks +/// every required member's existence and signature at compile time, analogous +/// to how `override` catches signature mismatches for virtual functions. +/// +/// - `RuntimeBase`: `kDeviceType` only (e.g. CPU). +/// - `DeviceRuntime`: adds `Stream`, `Malloc`, and `Free` (e.g. Cambricon). + +/// Every Runtime must provide `static constexpr Device::Type kDeviceType`. +template struct RuntimeBase { + static constexpr bool Validate() { + static_assert( + std::is_same_v, + Device::Type>, + "`Runtime` must define `static constexpr Device::Type kDeviceType`."); + return true; + } +}; + +/// Runtimes with device memory must additionally provide `Stream`, `Malloc`, +/// and `Free`. +template struct DeviceRuntime : RuntimeBase { + static constexpr bool Validate() { + RuntimeBase::Validate(); + static_assert(sizeof(typename Derived::Stream) > 0, + "`Runtime` must define a `Stream` type alias."); + static_assert( + std::is_invocable_v, + "`Runtime::Malloc` must be callable with `(void**, size_t)`."); + static_assert(std::is_invocable_v, + "`Runtime::Free` must be callable with `(void*)`."); + return true; + } +}; + +} // namespace infini::ccl + +#endif // INFINI_CCL_RUNTIME_H_ diff --git a/src/traits.h b/src/traits.h new file mode 100644 index 0000000..d045752 --- /dev/null +++ b/src/traits.h @@ -0,0 +1,198 @@ +#ifndef INFINI_CCL_TRAITS_H_ +#define INFINI_CCL_TRAITS_H_ + +#include +#include + +namespace infini::ccl { + +// --------------------- List and TypePack --------------------- +// A generic container for a sequence of compile-time values. +template struct List {}; + +template struct TypePack {}; + +// ----------------------------------------------------------------------------- +// Tags +// ----------------------------------------------------------------------------- +// Tags are passed as regular function arguments to user functors instead of +// template parameters. This lets users write plain C++17 `[](auto tag)` lambdas +// rather than C++20 template lambdas (`[]()`). + +// `TypeTag`: carries a C++ type. Recover with `typename +// decltype(tag)::type`. +template struct TypeTag { + using type = T; +}; + +// `ValueTag`: carries a compile-time value. Recover with +// `decltype(tag)::value`. +template struct ValueTag { + using value_type = decltype(v); + static constexpr auto value = v; +}; + +// ----------------------------------------------------------------------------- +// List Queries +// ----------------------------------------------------------------------------- + +// Check at compile-time if a value exists within a construct (e.g., `List<>`). +// Example: `static_assert(ContainsValue)`; +template struct Contains; + +template +struct Contains, value> + : std::disjunction...> {}; + +template +inline constexpr bool ContainsValue = Contains::value; + +// Check at compile-time if a type `T` is present in a variadic list of types +// `Ts`. +// Example: `static_assert(IsTypeInList)`; +template +inline constexpr bool IsTypeInList = (std::is_same_v || ...); + +// Trait to detect whether `T` is a `List<...>` specialization. +template struct IsListType : std::false_type {}; + +template struct IsListType> : std::true_type {}; + +// ----------------------------------------------------------------------------- +// List Operations +// ----------------------------------------------------------------------------- + +template +constexpr auto ListGetImpl(List) { + if constexpr (index == 0) + return head; + else + return ListGetImpl(List{}); +} + +// `ListGet(List{})` extracts the `i`th value from a `List` +// tag. +// Usage: `ListGet<1>(List<10, 20, 30>{})` is `20`. +template +constexpr auto ListGet(List list) { + return ListGetImpl(list); +} + +// `ListSize` gets the size of a List. +// Usage: `ListSize>::value` is `3`. +template struct ListSize; + +template struct ListSize> { + static constexpr size_t value = sizeof...(Args); +}; + +// Concatenates two List types into a single `List`. +// Example: `ConcatType, List<3, 4>>` is `List<1, 2, 3, 4>`. +template struct Concat; + +template +struct Concat, List> { + using type = List; +}; + +template +using ConcatType = typename Concat::type; + +// Flatten multi-level `List` into a single `List`. +// Example: `Flatten, List<2, 3>, List<4>>::type` is `List<1, 2, 3, 4>`. +template struct Flatten; + +template struct Flatten> { + using type = List; +}; + +template +struct Flatten { + using type = typename Flatten, Rest...>::type; +}; + +// Generic recursion to find the "best" element based on a `PriorityTrait`. +template