diff --git a/.ci/config.yaml b/.ci/config.yaml
index ea6a0d48..b53a269a 100644
--- a/.ci/config.yaml
+++ b/.ci/config.yaml
@@ -115,7 +115,7 @@ platforms:
     timeout: 3600
     stages:
       - name: test
-        run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
+        run: pytest tests/test_add.py tests/test_cast.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml

   cambricon:
     image:
diff --git a/src/cuda/cast/kernel.cuh b/src/cuda/cast/kernel.cuh
new file mode 100644
index 00000000..439529f3
--- /dev/null
+++ b/src/cuda/cast/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef INFINI_OPS_CUDA_CAST_KERNEL_CUH_
+#define INFINI_OPS_CUDA_CAST_KERNEL_CUH_
+
+#include "cuda/kernel_commons.cuh"
+
+namespace infini::ops {
+
+template <typename Caster, typename InT, typename OutT>
+__global__ void CastKernel(OutT* __restrict__ out,
+                           const InT* __restrict__ input,
+                           const size_t* __restrict__ out_shape,
+                           const size_t* __restrict__ input_shape,
+                           const ptrdiff_t* __restrict__ out_strides,
+                           const ptrdiff_t* __restrict__ input_strides,
+                           size_t output_size, size_t ndim, bool out_contiguous,
+                           bool input_contiguous) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < output_size) {
+    size_t out_idx =
+        out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
+    size_t input_idx =
+        input_contiguous ? idx
+                         : IndexToOffset(idx, ndim, input_shape, input_strides);
+
+    out[out_idx] = Caster::template Cast<OutT>(input[input_idx]);
+  }
+}
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/cast/kernel.h b/src/cuda/cast/kernel.h
new file mode 100644
index 00000000..756b9317
--- /dev/null
+++ b/src/cuda/cast/kernel.h
@@ -0,0 +1,93 @@
+#ifndef INFINI_OPS_CUDA_CAST_KERNEL_H_
+#define INFINI_OPS_CUDA_CAST_KERNEL_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <tuple>
+#include <vector>
+
+#include "base/cast.h"
+#include "cuda/cast/kernel.cuh"
+#include "cuda/runtime_utils.h"
+
+namespace infini::ops {
+
+template <typename Backend, typename Caster>
+class CudaCast : public Cast {
+ public:
+  CudaCast(const Tensor input, Tensor out) : Cast{input, out} {
+    size_t shape_size = ndim_ * sizeof(*d_input_shape_);
+    size_t strides_size = ndim_ * sizeof(*d_input_strides_);
+    const size_t metadata_size = 2 * (shape_size + strides_size);
+    std::vector<std::byte> metadata(metadata_size);
+
+    Backend::Malloc((void**)&d_metadata_, metadata_size);
+
+    size_t offset = 0;
+    d_input_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_input_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size);
+    offset += strides_size;
+
+    d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
+
+    Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
+                    Backend::MemcpyHostToDevice);
+  }
+
+  ~CudaCast() { Backend::Free(d_metadata_); }
+
+  void operator()(const Tensor input, Tensor out) const override {
+    if (output_size_ == 0) {
+      return;
+    }
+
+    int block_size = RuntimeUtils::GetOptimalBlockSize();
+
+    DispatchFunc(
+        {static_cast<int>(input_dtype_), static_cast<int>(out_dtype_)},
+        [&](auto list_tag) {
+          using InT = TypeMapType<std::get<0>(list_tag)>;
+          using OutT = TypeMapType<std::get<1>(list_tag)>;
+
+          auto cuda_stream =
+              static_cast<cudaStream_t>(stream_ ? stream_ : 0);
+          dim3 block_dims(
+              std::min(static_cast<size_t>(block_size), output_size_));
+          dim3 grid_dims(utils::CeilDiv(output_size_, block_dims.x));
+
+          CastKernel<Caster, InT, OutT>
+              <<<grid_dims, block_dims, 0, cuda_stream>>>(
+                  reinterpret_cast<OutT*>(out.data()),
+                  reinterpret_cast<const InT*>(input.data()), d_out_shape_,
+                  d_input_shape_, d_out_strides_, d_input_strides_,
+                  output_size_, ndim_, is_out_contiguous_,
+                  is_input_contiguous_);
+        },
+        "CudaCast::operator()");
+  }
+
+ private:
+  std::byte* d_metadata_{nullptr};
+
+  Tensor::Size* d_input_shape_{nullptr};
+
+  Tensor::Size* d_out_shape_{nullptr};
+
+  Tensor::Stride* d_input_strides_{nullptr};
+
+  Tensor::Stride* d_out_strides_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/iluvatar/cast/kernel.h b/src/cuda/iluvatar/cast/kernel.h
new file mode 100644
index 00000000..9843e446
--- /dev/null
+++ b/src/cuda/iluvatar/cast/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_ILUVATAR_CAST_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+#include "cuda/cast/kernel.h"
+#include "cuda/iluvatar/caster.cuh"
+#include "cuda/iluvatar/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<IluvatarRuntime, Cast>
+    : public CudaCast<IluvatarRuntime, IluvatarCaster<IluvatarRuntime>> {
+ public:
+  using CudaCast<IluvatarRuntime, IluvatarCaster<IluvatarRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/metax/cast/kernel.h b/src/cuda/metax/cast/kernel.h
new file mode 100644
index 00000000..05735e7e
--- /dev/null
+++ b/src/cuda/metax/cast/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_METAX_CAST_KERNEL_H_
+#define INFINI_OPS_METAX_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+#include "cuda/cast/kernel.h"
+#include "cuda/metax/caster.cuh"
+#include "cuda/metax/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<MetaxRuntime, Cast>
+    : public CudaCast<MetaxRuntime, MetaxCaster<MetaxRuntime>> {
+ public:
+  using CudaCast<MetaxRuntime, MetaxCaster<MetaxRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/moore/cast/kernel.h b/src/cuda/moore/cast/kernel.h
new file mode 100644
index 00000000..51dedf79
--- /dev/null
+++ b/src/cuda/moore/cast/kernel.h
@@ -0,0 +1,26 @@
+#ifndef INFINI_OPS_MOORE_CAST_KERNEL_H_
+#define INFINI_OPS_MOORE_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+// clang-format off
+#include "cuda/moore/polyfills.cuh"
+// clang-format on
+
+#include "cuda/cast/kernel.h"
+#include "cuda/moore/caster.cuh"
+#include "cuda/moore/polyfills.cuh"
+#include "cuda/moore/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<MooreRuntime, Cast>
+    : public CudaCast<MooreRuntime, MooreCaster<MooreRuntime>> {
+ public:
+  using CudaCast<MooreRuntime, MooreCaster<MooreRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/nvidia/cast/kernel.h b/src/cuda/nvidia/cast/kernel.h
new file mode 100644
index 00000000..0e69a031
--- /dev/null
+++ b/src/cuda/nvidia/cast/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_NVIDIA_CAST_KERNEL_H_
+#define INFINI_OPS_NVIDIA_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+#include "cuda/cast/kernel.h"
+#include "cuda/nvidia/caster.cuh"
+#include "cuda/nvidia/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<NvidiaRuntime, Cast>
+    : public CudaCast<NvidiaRuntime, NvidiaCaster<NvidiaRuntime>> {
+ public:
+  using CudaCast<NvidiaRuntime, NvidiaCaster<NvidiaRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/tests/test_cast.py b/tests/test_cast.py
index bd19d934..0f2959a5 100644
--- a/tests/test_cast.py
+++ b/tests/test_cast.py
@@ -14,6 +14,8 @@
         ((13, 4, 4), None, None),
         ((16, 5632), None, None),
         ((4, 4, 5632), None, None),
+        ((0, 16), None, None),
+        ((4, 0, 8), None, None),
     ),
 )
 @pytest.mark.parametrize(
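
Two short reference sketches follow; neither is part of the patch above. The first is a self-contained CUDA program showing the indexing scheme CastKernel depends on: a contiguous tensor is addressed by the flat thread index directly, while a strided view goes through an IndexToOffset translation. The IndexToOffset below is an assumed stand-in for the helper in cuda/kernel_commons.cuh, which this diff does not show, so the real signature may differ.

// Standalone sketch, not part of the diff. IndexToOffset is a guessed
// stand-in for the helper from cuda/kernel_commons.cuh.
#include <cstdio>
#include <cuda_runtime.h>

// Map a flat element index to a memory offset via shape/strides,
// innermost dimension last (row-major order), strides in elements.
__device__ size_t IndexToOffset(size_t idx, size_t ndim, const size_t* shape,
                                const ptrdiff_t* strides) {
  size_t offset = 0;
  for (size_t d = ndim; d-- > 0;) {
    offset += (idx % shape[d]) * strides[d];
    idx /= shape[d];
  }
  return offset;
}

__global__ void CastF32ToI32(int* out, const float* in, const size_t* shape,
                             const ptrdiff_t* strides, size_t n, size_t ndim) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Output is contiguous; input may be strided (e.g. a transposed view).
    out[i] = static_cast<int>(in[IndexToOffset(i, ndim, shape, strides)]);
  }
}

int main() {
  // A 2x3 view that is the transpose of a contiguous 3x2 buffer:
  // shape = {2, 3}, strides = {1, 2}.
  const size_t h_shape[2] = {2, 3};
  const ptrdiff_t h_strides[2] = {1, 2};
  const float h_in[6] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f};

  float* d_in; int* d_out; size_t* d_shape; ptrdiff_t* d_strides;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, 6 * sizeof(int));
  cudaMalloc(&d_shape, sizeof(h_shape));
  cudaMalloc(&d_strides, sizeof(h_strides));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_shape, h_shape, sizeof(h_shape), cudaMemcpyHostToDevice);
  cudaMemcpy(d_strides, h_strides, sizeof(h_strides), cudaMemcpyHostToDevice);

  CastF32ToI32<<<1, 64>>>(d_out, d_in, d_shape, d_strides, 6, 2);

  int h_out[6];
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int v : h_out) printf("%d ", v);  // 0 2 4 1 3 5 (truncation toward 0)
  printf("\n");
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_shape); cudaFree(d_strides);
  return 0;
}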
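
The second sketch isolates the metadata-packing scheme in the CudaCast constructor, with plain cudaMalloc/cudaMemcpy standing in for the Backend wrappers (whose exact signatures the diff only implies): both shape arrays and both stride arrays are staged in a single host buffer and shipped with one device allocation and one host-to-device copy, so each operator instance costs one Malloc/Memcpy pair instead of four.

// Standalone sketch, not part of the diff; PackedMeta/PackMetadata are
// hypothetical names for illustration only.
#include <cstddef>
#include <cstring>
#include <vector>
#include <cuda_runtime.h>

struct PackedMeta {
  std::byte* d_metadata = nullptr;  // single device allocation
  size_t* d_in_shape = nullptr;     // the four views into d_metadata
  size_t* d_out_shape = nullptr;
  ptrdiff_t* d_in_strides = nullptr;
  ptrdiff_t* d_out_strides = nullptr;
};

PackedMeta PackMetadata(const std::vector<size_t>& in_shape,
                        const std::vector<size_t>& out_shape,
                        const std::vector<ptrdiff_t>& in_strides,
                        const std::vector<ptrdiff_t>& out_strides) {
  const size_t ndim = in_shape.size();
  const size_t shape_size = ndim * sizeof(size_t);
  const size_t strides_size = ndim * sizeof(ptrdiff_t);
  const size_t total = 2 * (shape_size + strides_size);

  // Stage everything in one host buffer, then copy once.
  std::vector<std::byte> host(total);
  PackedMeta m;
  cudaMalloc(reinterpret_cast<void**>(&m.d_metadata), total);

  size_t offset = 0;
  m.d_in_shape = reinterpret_cast<size_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, in_shape.data(), shape_size);
  offset += shape_size;
  m.d_out_shape = reinterpret_cast<size_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, out_shape.data(), shape_size);
  offset += shape_size;
  m.d_in_strides = reinterpret_cast<ptrdiff_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, in_strides.data(), strides_size);
  offset += strides_size;
  m.d_out_strides = reinterpret_cast<ptrdiff_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, out_strides.data(), strides_size);

  cudaMemcpy(m.d_metadata, host.data(), total, cudaMemcpyHostToDevice);
  return m;  // caller releases with cudaFree(m.d_metadata)
}

Since size_t and ptrdiff_t are both 8 bytes on the relevant targets, the packed sub-arrays stay naturally aligned without padding, which is what makes the single-allocation layout safe.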