Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e308e90
feat: add data type related interface and implementation as well as O…
Ziminli Apr 3, 2026
83c0334
feat: add the external and internal interfaces for return code/status.
Ziminli Apr 3, 2026
2f539ad
feat: add the definitions of device, dispatcher, and compile-time tra…
Ziminli Apr 3, 2026
175837a
feat: add `src/backend.h` which contains the definition of `BackendType`
Ziminli Apr 9, 2026
faa40a3
feat: add runtime and platform specialization of `DeviceEnabled` for …
Ziminli Apr 9, 2026
109d36e
refactor: change `DataType` and `ReturnStatus` from aliasing to scope…
Ziminli Apr 10, 2026
5daec20
feat: add `Communicator` and `BackendCommInstance`
Ziminli Apr 10, 2026
7a03f50
style: add comments for the `#endif` in various files
Ziminli Apr 11, 2026
cfaf3fa
feat: add priority levels for `BackendType` and `Device::Type` and `O…
Ziminli Apr 11, 2026
3ac34be
feat: add the definition of some communication functions and implemen…
Ziminli Apr 11, 2026
bf969ac
feat: add `TypeMap` and `DataTypeMap`, device-dependent bf16 and fp16…
Ziminli Apr 14, 2026
d164d07
feat: add `Operation` and function calling chain using `Init`/`infini…
Ziminli Apr 14, 2026
572528d
build: add CMake build system and incomplete allreduce example program
Ziminli Apr 14, 2026
98e5adb
build: refine header visibility and include interfaces
Ziminli Apr 14, 2026
58257e4
feat: enable the allreduce example with internal runtime
Ziminli Apr 15, 2026
f1527cb
feat: support `infiniGetRank()` and add rank-related info printing in…
Ziminli Apr 15, 2026
d9607dd
feat: support `infiniGetSize()` and add its usage in `examples/all_re…
Ziminli Apr 15, 2026
41e3bca
feat: create `examples/utils.h` and support `infiniFinalize()`
Ziminli Apr 15, 2026
12e0321
feat: support `infiniCommInitAll()` and `infiniCommDestroy()`, suppor…
Ziminli Apr 15, 2026
7102b31
feat: add a simple `Logger` and its `PrintMsg()` method for unifying …
Ziminli Apr 16, 2026
d32d9c2
refactor: add `LOG` macro for convenient logging and change some `TOD…
Ziminli Apr 16, 2026
80d4b34
feat: support `infiniAllReduce()` and its ompi backend, add profiling…
Ziminli Apr 16, 2026
a0c755a
refactor: add warm-up iteration and profiling iteration for `examples…
Ziminli Apr 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Build artifacts
/build/
include/infiniccl.h

# Prerequisites
*.d

Expand Down
112 changes: 112 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
cmake_minimum_required(VERSION 3.18)
project(InfiniCCL LANGUAGES C CXX)

# All C++ targets are built as C++17; configuration fails if the compiler
# cannot provide that standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)

# =========================================================
# --- DEVICE OPTIONS (Hardware Runtimes) ---
# =========================================================
option(WITH_NVIDIA "Enable NVIDIA GPU support" OFF)
option(WITH_METAX "Enable MetaX GPU support" OFF)

# The CPU device is not user-selectable: it is recorded as an INTERNAL cache
# entry whose value is always ON.
set(WITH_CPU ON CACHE INTERNAL "CPU backend is always enabled")

# =========================================================
# --- BACKEND OPTIONS (Communication Protocols) ---
# =========================================================
option(WITH_OMPI "Enable OpenMPI backend" OFF)
option(WITH_NCCL "Enable NCCL backend" OFF)

# OMPI is the default bootstrap/CPU backend: when neither backend was
# requested, fall back to it so that at least one backend is enabled.
if(NOT (WITH_OMPI OR WITH_NCCL))
  set(WITH_OMPI ON)
  message(STATUS "No backend specified. Defaulting to WITH_OMPI=ON")
endif()

# =========================================================
# --- MISC. BUILD OPTIONS ---
# =========================================================
option(AUTO_DETECT_DEVICES "Automatically detect available devices" ON)

# Auto-detection only ever turns a device ON. It never turns one OFF, so an
# explicit `-DWITH_<DEVICE>=ON` from the user is respected even when the
# configuring machine has no such device (e.g. a container or build host).
if(AUTO_DETECT_DEVICES)
  message(STATUS "Auto-detecting available devices...")

  # NVIDIA: enabled when any `/dev/nvidia*` device node exists.
  file(GLOB NVIDIA_DEV_FILES "/dev/nvidia*")

  if(NVIDIA_DEV_FILES)
    set(WITH_NVIDIA ON)
    message(STATUS "Auto-detected NVIDIA environment.")
  endif()

  # MetaX: enabled when the MACA_PATH environment variable is defined, or
  # otherwise when a PCI device reports a vendor ID containing `9999`.
  if(DEFINED ENV{MACA_PATH})
    set(WITH_METAX ON)
    message(STATUS "Auto-detected MetaX environment from MACA_PATH")
  else()
    # `grep -h` prints matching lines without file names; stderr is discarded
    # so hosts without `/sys/bus/pci` simply produce empty output.
    execute_process(
      COMMAND sh -c "grep -h 9999 /sys/bus/pci/devices/*/vendor 2>/dev/null"
      OUTPUT_VARIABLE _pci_vendor_output
      OUTPUT_STRIP_TRAILING_WHITESPACE
    )

    string(FIND "${_pci_vendor_output}" "9999" _found_pos)

    if(_found_pos GREATER -1)
      set(WITH_METAX ON)
      message(STATUS "Detected MetaX GPU from PCI vendor ID 0x9999")
    elseif(WITH_METAX)
      # Do not override an explicit user request just because detection
      # failed on this machine.
      message(STATUS "No MetaX GPU detected; keeping requested WITH_METAX=ON")
    else()
      message(STATUS "No MetaX GPU detected")
    endif()
  endif()
endif()

# =========================================================
# --- DEPENDENCY DISCOVERY ---
# =========================================================

# NVIDIA runtime: enable the CUDA language and locate the CUDA toolkit.
if(WITH_NVIDIA)
  enable_language(CUDA)
  find_package(CUDAToolkit REQUIRED)
endif()

# MetaX runtime: locate the MACA installation and its runtime library.
if(WITH_METAX)
  # Left unquoted on purpose: an unset/empty environment variable leaves no
  # normal MACA_PATH variable behind.
  set(MACA_PATH $ENV{MACA_PATH})
  if(NOT MACA_PATH)
    # Normally can be found at: `/opt/maca/`.
    set(MACA_PATH "/opt/maca")
    message(WARNING "MACA_PATH environment variable not set. Defaulting to /opt/maca/. Please verify this path is correct.")
  endif()

  # Route both C and C++ compilation through the project's mxcc wrapper.
  # NOTE(review): this runs after project() has already enabled C/CXX; CMake
  # documents CMAKE_<LANG>_COMPILER as something to set before the language is
  # enabled -- confirm this override takes effect as intended.
  set(CMAKE_C_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh)
  set(CMAKE_CXX_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh)

  # Directory-scoped: applies to every target defined after this point.
  include_directories("${MACA_PATH}/include")
  link_directories("${MACA_PATH}/lib")

  find_library(
    MACA_RUNTIME_LIB
    NAMES mcruntime
    HINTS "${MACA_PATH}/lib"
    REQUIRED
  )
endif()

# OpenMPI backend.
if(WITH_OMPI)
  find_package(MPI REQUIRED)
endif()

# NCCL backend: only valid together with the NVIDIA device.
if(WITH_NCCL)
  if(NOT WITH_NVIDIA)
    message(FATAL_ERROR "NCCL backend requires NVIDIA GPU support. Please enable WITH_NVIDIA.")
  endif()

  find_library(NCCL_LIB NAMES nccl REQUIRED)
  find_path(NCCL_INC NAMES nccl.h REQUIRED)

  # Directory-scoped: makes `nccl.h` visible to every target defined below.
  include_directories(${NCCL_INC})
endif()

# Python is required for code generation
find_package(Python3 REQUIRED)

# Directory-scoped include paths shared by every target below: the public
# `include/` directory first, then the repository root.
include_directories(
  ${CMAKE_CURRENT_SOURCE_DIR}/include
  ${CMAKE_CURRENT_SOURCE_DIR}
)

# The library is configured before the examples that link against it.
add_subdirectory(src)
add_subdirectory(examples)
26 changes: 26 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Builds one executable per `*.cc` file found (recursively) under this
# directory. The executable is named after the source file without its
# directory or extension.
#
# CONFIGURE_DEPENDS makes the build system re-check the glob, so newly added
# examples are picked up without a manual re-configure.
file(GLOB_RECURSE EXAMPLE_SOURCES CONFIGURE_DEPENDS
  "${CMAKE_CURRENT_SOURCE_DIR}/*.cc"
)

foreach(source_file IN LISTS EXAMPLE_SOURCES)
  get_filename_component(example_name "${source_file}" NAME_WE)

  add_executable(${example_name} "${source_file}")

  target_link_libraries(${example_name} PRIVATE infiniccl)

  # Add runtime and backend dependencies for direct runtime/backend usage
  if(WITH_NVIDIA)
    target_link_libraries(${example_name} PRIVATE CUDA::cudart)
  endif()

  if(WITH_OMPI)
    target_link_libraries(${example_name} PRIVATE MPI::MPI_CXX)
  endif()

  # Explicitly allow examples to "peek" into the internal src and binary dirs
  # This is necessary because these were marked PRIVATE in the library's CMake
  # PROJECT_* (not CMAKE_*) directories are used for both paths so they stay
  # correct when this project is added as a subdirectory of another build.
  target_include_directories(${example_name} PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${PROJECT_SOURCE_DIR}/src" # For internal templates like runtime.h
    "${PROJECT_BINARY_DIR}/src" # For the generated backend_manifest.h
  )
endforeach()
142 changes: 142 additions & 0 deletions examples/all_reduce.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/**
 * InfiniCCL Example: AllReduce
 *
 * This example demonstrates the planned API for performing a
 * collective sum-reduction across multiple GPUs and nodes.
 */

#include <iostream>
#include <unistd.h>
#include <vector>

// Public API
#include "infiniccl.h"

// Example-specific utilities
#include "utils.h"

// Internal Headers (Accessible via example-specific include paths, technically
// not public APIs)
#include "backend_manifest.h"
#include "device.h"
#include "runtime.h"
#include "traits.h"

using namespace infini::ccl;

/**
 * Runs a sum AllReduce of `kNumElements` floats across all ranks, validates
 * the result on the host, and reports timing from rank 0.
 *
 * Each rank contributes a buffer filled with `rank + 1`, so every element of
 * the reduced buffer is expected to equal `1 + 2 + ... + size`.
 *
 * @param argc, argv    Forwarded to `infiniInit()`.
 * @param warmup_iter   Number of untimed AllReduce calls. At least one is
 *                      always performed because its output is validated.
 * @param profile_iter  Number of timed AllReduce calls; the reported time is
 *                      the average over these iterations.
 * @param kNumElements  Number of float elements in each buffer.
 *
 * Any failing InfiniCCL call terminates the process via `CHECK_INFINI`.
 */
void RunAllReduceExample(int argc, char **argv, int warmup_iter,
                         int profile_iter, const size_t kNumElements) {
  // Highest-priority device type among those enabled at compile time.
  constexpr Device::Type kDevType =
      ListGetBest<DevicePriority>(EnabledDevices{});

  CHECK_INFINI(infiniInit(&argc, &argv));

  int rank, size;
  CHECK_INFINI(infiniGetRank(&rank));
  CHECK_INFINI(infiniGetSize(&size));

  // `gethostname` may fail, and on truncation the buffer is not guaranteed to
  // be NUL-terminated, so zero-initialise, check the result, and terminate the
  // buffer explicitly before printing it.
  char hostname[256] = {};
  const bool have_hostname = gethostname(hostname, sizeof(hostname)) == 0;
  hostname[sizeof(hostname) - 1] = '\0';
  const char *host_label = have_hostname ? hostname : "unknown";

  // Map local rank to GPU device.
  // Note: this is just for info printing. In practice, this part is not needed.
  const char *local_rank_str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");
  int local_rank = 0;
  if (local_rank_str != nullptr) {
    local_rank = std::atoi(local_rank_str);
  }

  std::cout << "[Rank " << rank << "] Host: " << host_label
            << " | GPU: " << Device::StringFromType(kDevType) << " "
            << " | Device " << local_rank << std::endl;

  // Setup Communicator
  infiniComm_t comm;
  CHECK_INFINI(infiniCommInitAll(&comm, size, nullptr));

  // Prepare Data
  std::vector<float> h_send(kNumElements);
  std::vector<float> h_recv(kNumElements, 0.0f);

  // Initialize: each rank provides its (rank + 1) as data
  for (size_t i = 0; i < kNumElements; i++) {
    h_send[i] = static_cast<float>(rank + 1);
  }

  // Device-side buffers, populated from the host vectors above.
  float *d_send, *d_recv;
  size_t total_bytes = kNumElements * sizeof(*d_send);
  Runtime<kDevType>::Malloc(&d_send, total_bytes);
  Runtime<kDevType>::Malloc(&d_recv, total_bytes);
  Runtime<kDevType>::Memcpy(d_send, h_send.data(), total_bytes,
                            Runtime<kDevType>::MemcpyHostToDevice);
  Runtime<kDevType>::Memcpy(d_recv, h_recv.data(), total_bytes,
                            Runtime<kDevType>::MemcpyHostToDevice);

  if (rank == 0) {
    std::cout << "\n=== Performing AllReduce on GPU Memory ===" << std::endl;
    std::cout << "Data size: " << kNumElements << " floats ("
              << total_bytes / 1024 / 1024 << " MB)" << std::endl;
    std::cout << "Operation: Sum" << std::endl;
    std::cout << "Warm-up iterations: " << warmup_iter << std::endl;
    std::cout << "Profile iterations: " << profile_iter << std::endl;
  }

  Runtime<kDevType>::StreamSynchronize(nullptr);

  // warm-up and D2H transfer the answer
  // NOTE(review): no explicit synchronisation separates this AllReduce from
  // the copy below; presumably `Memcpy` is ordered after work on the null
  // stream -- confirm against the Runtime implementation.
  CHECK_INFINI(infiniAllReduce(d_send, d_recv, kNumElements, infiniFloat32,
                               infiniSum, comm, nullptr));
  Runtime<kDevType>::Memcpy(h_recv.data(), d_recv, total_bytes,
                            Runtime<kDevType>::MemcpyDeviceToHost);

  // Remaining warm-up iterations (the first one already ran above).
  for (int i = 1; i < warmup_iter; ++i) {
    CHECK_INFINI(infiniAllReduce(d_send, d_recv, kNumElements, infiniFloat32,
                                 infiniSum, comm, nullptr));
  }
  Runtime<kDevType>::StreamSynchronize(nullptr);

  // Profiling
  Timer timer;

  for (int i = 0; i < profile_iter; i++) {
    CHECK_INFINI(infiniAllReduce(d_send, d_recv, kNumElements, infiniFloat32,
                                 infiniSum, comm, nullptr));
  }

  // Synchronise before reading the timer so the measurement covers the
  // submitted iterations, then average over them.
  Runtime<kDevType>::StreamSynchronize(nullptr);
  double elapsed = timer.elapsed_ms() / static_cast<double>(profile_iter);

  // Result Validation
  float expected = 0.0f;
  for (int r = 0; r < size; r++) {
    expected += static_cast<float>(r + 1);
  }

  Validator::ValidateResult(h_recv.data(), kNumElements, expected, rank);

  // Metrics Reporting (Only from rank 0 for cleaner output)
  if (rank == 0) {
    Metrics metrics{elapsed, total_bytes, size};
    metrics.Print();
  }

  // Cleanup
  Runtime<kDevType>::Free(d_send);
  Runtime<kDevType>::Free(d_recv);

  CHECK_INFINI(infiniCommDestroy(comm));
  CHECK_INFINI(infiniFinalize());

  if (rank == 0) {
    std::cout << "InfiniCCL finalized." << std::endl;
  }
}

/**
 * Entry point: runs the AllReduce example with fixed defaults
 * (2 warm-up iterations, 20 profiled iterations, 2^20 float elements).
 */
int main(int argc, char **argv) {
  const int warmup_iters = 2;
  const int profile_iters = 20;
  const size_t num_elements = static_cast<size_t>(1) << 20;

  RunAllReduceExample(argc, argv, warmup_iters, profile_iters, num_elements);

  return EXIT_SUCCESS;
}
100 changes: 100 additions & 0 deletions examples/utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#ifndef INFINI_CCL_EXAMPLES_UTILS_H_
#define INFINI_CCL_EXAMPLES_UTILS_H_

#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

// Simple check macro for the C-API.
//
// Evaluates `cmd` exactly once. If the result is not `infiniSuccess`, prints
// the error code together with the line and file of the call site to stderr
// and terminates the process with `EXIT_FAILURE`.
//
// The temporary uses a deliberately unusual name so that an expression such as
// `CHECK_INFINI(f(res))` still refers to the caller's `res` rather than to the
// macro's own, not-yet-initialised variable.
#define CHECK_INFINI(cmd)                                                  \
  do {                                                                     \
    const infiniResult_t infini_check_status_ = (cmd);                     \
    if (infini_check_status_ != infiniSuccess) {                           \
      std::cerr << "[InfiniCCL Error] received error code "                \
                << infini_check_status_ << " at line " << __LINE__         \
                << " (" << __FILE__ << ")" << std::endl;                   \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)

// Simple Timer for profiling.
//
// Starts measuring at construction. Uses `std::chrono::steady_clock`, which is
// monotonic, so the measured interval cannot go backwards or jump when the
// system wall clock is adjusted.
class Timer {
  using Clock = std::chrono::steady_clock;

  // Time point captured when the timer was constructed.
  Clock::time_point start;

 public:
  Timer() : start(Clock::now()) {}

  // Returns the time elapsed since construction, in milliseconds.
  double elapsed_ms() const {
    const auto end = Clock::now();
    return std::chrono::duration<double, std::milli>(end - start).count();
  }
};

// Timing summary for one collective run.
//
// Kept as a plain aggregate so it can be brace-initialised as
// `Metrics{elapsed_ms, total_bytes, world_size}`.
struct Metrics {
  double elapsed_ms;   // Duration being reported, in milliseconds.
  size_t total_bytes;  // Payload size in bytes.
  int world_size;      // Number of participating ranks.

  // Prints the elapsed time, the bus bandwidth and the algorithm bandwidth to
  // stdout. Sizes are converted with 1024^3 bytes per "GB".
  void Print() const {
    const double bytes_per_gb = 1024.0 * 1024.0 * 1024.0;
    const double size_gb = static_cast<double>(total_bytes) / bytes_per_gb;
    const double duration_s = elapsed_ms / 1000.0;

    // Algorithm bandwidth: payload size divided by elapsed time.
    const double alg_bw = size_gb / duration_s;

    // Industry standard formula: 2 * (n-1) / n
    const double bus_factor = 2.0 * (world_size - 1) / world_size;
    const double bus_bw = bus_factor * alg_bw;

    std::cout << "Time: " << std::fixed << std::setprecision(3)
              << elapsed_ms << " ms" << std::endl;
    std::cout << "Throughput: " << std::fixed << std::setprecision(2)
              << bus_bw << " GB/s (Bus BW)" << std::endl;
    std::cout << "Alg Bandwidth: " << std::fixed << std::setprecision(2)
              << alg_bw << " GB/s" << std::endl;
  }
};

// Host-side result checker for the example programs.
class Validator {
 public:
  // Checks that each of the `count` values in `data` matches `expected_val`
  // within an absolute tolerance of 1e-3.
  //
  // A value counts as matching only if it compares equal to `expected_val` or
  // the absolute difference is within tolerance. NaN therefore never matches,
  // because every comparison involving NaN is false.
  //
  // Only rank 0 prints: the first three mismatches go to stderr and a summary
  // goes to stdout. With `count == 0` the result is vacuously `true` and the
  // summary shows "n/a" instead of reading `data[0]`.
  //
  // Returns `true` when every element matched, `false` otherwise.
  template <typename T>
  static bool ValidateResult(const T *data, size_t count, T expected_val,
                             int rank) {
    const double tolerance = 1e-3;
    const double expected = static_cast<double>(expected_val);
    size_t error_count = 0;

    for (size_t i = 0; i < count; ++i) {
      const double actual = static_cast<double>(data[i]);
      // Written as a positive "matches" test so that NaN falls through to the
      // error path instead of silently passing.
      const bool matches =
          (actual == expected) || (std::fabs(actual - expected) <= tolerance);
      if (!matches) {
        error_count++;
        if (error_count <= 3 && rank == 0) {
          std::cerr << "Error at index " << i << ": " << data[i]
                    << " != " << expected_val << std::endl;
        }
      }
    }

    const bool correct = (error_count == 0);

    if (rank == 0) {
      // ANSI colour escape sequences for the verdict.
      const char *GREEN = "\033[32m";
      const char *RED = "\033[31m";
      const char *RESET = "\033[0m";

      std::cout << "\n=== AllReduce Results ===" << std::endl;
      std::cout << "Correct: "
                << (correct ? (GREEN + std::string("YES") + RESET)
                            : (RED + std::string("NO") + RESET));
      if (!correct)
        std::cout << " (" << error_count << " errors)";
      std::cout << std::endl;

      std::cout << "Expect: " << std::fixed << std::setprecision(2)
                << expected << std::endl;
      std::cout << "Actual: ";
      if (count > 0) {
        std::cout << std::fixed << std::setprecision(2)
                  << static_cast<double>(data[0]);
      } else {
        // Nothing to show for an empty buffer.
        std::cout << "n/a";
      }
      std::cout << std::endl;
    }
    return correct;
  }
};

#endif
Loading