diff --git a/.ci/config.yaml b/.ci/config.yaml
index ea6a0d48..b53a269a 100644
--- a/.ci/config.yaml
+++ b/.ci/config.yaml
@@ -115,7 +115,7 @@ platforms:
     timeout: 3600
     stages:
       - name: test
-        run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
+        run: pytest tests/test_add.py tests/test_cast.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml

   cambricon:
     image:
diff --git a/src/cuda/cast/kernel.cuh b/src/cuda/cast/kernel.cuh
new file mode 100644
index 00000000..439529f3
--- /dev/null
+++ b/src/cuda/cast/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef INFINI_OPS_CUDA_CAST_KERNEL_CUH_
+#define INFINI_OPS_CUDA_CAST_KERNEL_CUH_
+
+#include "cuda/kernel_commons.cuh"
+
+namespace infini::ops {
+
+template <typename Caster, typename InT, typename OutT>
+__global__ void CastKernel(OutT* __restrict__ out,
+                           const InT* __restrict__ input,
+                           const size_t* __restrict__ out_shape,
+                           const size_t* __restrict__ input_shape,
+                           const ptrdiff_t* __restrict__ out_strides,
+                           const ptrdiff_t* __restrict__ input_strides,
+                           size_t output_size, size_t ndim, bool out_contiguous,
+                           bool input_contiguous) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < output_size) {
+    size_t out_idx =
+        out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
+    size_t input_idx =
+        input_contiguous ? idx
+                         : IndexToOffset(idx, ndim, input_shape, input_strides);
+
+    out[out_idx] = Caster::template Cast<OutT>(input[input_idx]);
+  }
+}
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/cast/kernel.h b/src/cuda/cast/kernel.h
new file mode 100644
index 00000000..756b9317
--- /dev/null
+++ b/src/cuda/cast/kernel.h
@@ -0,0 +1,93 @@
+#ifndef INFINI_OPS_CUDA_CAST_KERNEL_H_
+#define INFINI_OPS_CUDA_CAST_KERNEL_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <tuple>
+#include <vector>
+
+#include "base/cast.h"
+#include "cuda/cast/kernel.cuh"
+#include "cuda/runtime_utils.h"
+
+namespace infini::ops {
+
+template <typename Backend, typename Caster>
+class CudaCast : public Cast {
+ public:
+  CudaCast(const Tensor input, Tensor out) : Cast{input, out} {
+    size_t shape_size = ndim_ * sizeof(*d_input_shape_);
+    size_t strides_size = ndim_ * sizeof(*d_input_strides_);
+    const size_t metadata_size = 2 * (shape_size + strides_size);
+    std::vector<std::byte> metadata(metadata_size);
+
+    Backend::Malloc((void**)&d_metadata_, metadata_size);
+
+    size_t offset = 0;
+    d_input_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_input_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size);
+    offset += strides_size;
+
+    d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
+
+    Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
+                    Backend::MemcpyHostToDevice);
+  }
+
+  ~CudaCast() { Backend::Free(d_metadata_); }
+
+  void operator()(const Tensor input, Tensor out) const override {
+    if (output_size_ == 0) {
+      return;
+    }
+
+    int block_size = RuntimeUtils::GetOptimalBlockSize();
+
+    DispatchFunc(
+        {static_cast<int>(input_dtype_), static_cast<int>(out_dtype_)},
+        [&](auto list_tag) {
+          using InT = TypeMapType<std::get<0>(list_tag)>;
+          using OutT = TypeMapType<std::get<1>(list_tag)>;
+
+          auto cuda_stream =
+              static_cast<cudaStream_t>(stream_ ? stream_ : 0);
+          dim3 block_dims(
+              std::min(static_cast<size_t>(block_size), output_size_));
+          dim3 grid_dims(utils::CeilDiv(output_size_, block_dims.x));
+
+          CastKernel<Caster, InT, OutT>
+              <<<grid_dims, block_dims, 0, cuda_stream>>>(
+                  reinterpret_cast<OutT*>(out.data()),
+                  reinterpret_cast<const InT*>(input.data()), d_out_shape_,
+                  d_input_shape_, d_out_strides_, d_input_strides_,
+                  output_size_, ndim_, is_out_contiguous_,
+                  is_input_contiguous_);
+        },
+        "CudaCast::operator()");
+  }
+
+ private:
+  std::byte* d_metadata_{nullptr};
+
+  Tensor::Size* d_input_shape_{nullptr};
+
+  Tensor::Size* d_out_shape_{nullptr};
+
+  Tensor::Stride* d_input_strides_{nullptr};
+
+  Tensor::Stride* d_out_strides_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/iluvatar/cast/kernel.h b/src/cuda/iluvatar/cast/kernel.h
new file mode 100644
index 00000000..9843e446
--- /dev/null
+++ b/src/cuda/iluvatar/cast/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_ILUVATAR_CAST_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+#include "cuda/cast/kernel.h"
+#include "cuda/iluvatar/caster.cuh"
+#include "cuda/iluvatar/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<IluvatarRuntime, Cast>
+    : public CudaCast<IluvatarRuntime, IluvatarCaster<IluvatarRuntime>> {
+ public:
+  using CudaCast<IluvatarRuntime, IluvatarCaster<IluvatarRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/metax/cast/kernel.h b/src/cuda/metax/cast/kernel.h
new file mode 100644
index 00000000..05735e7e
--- /dev/null
+++ b/src/cuda/metax/cast/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_METAX_CAST_KERNEL_H_
+#define INFINI_OPS_METAX_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+#include "cuda/cast/kernel.h"
+#include "cuda/metax/caster.cuh"
+#include "cuda/metax/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<MetaxRuntime, Cast>
+    : public CudaCast<MetaxRuntime, MetaxCaster<MetaxRuntime>> {
+ public:
+  using CudaCast<MetaxRuntime, MetaxCaster<MetaxRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/moore/cast/kernel.h b/src/cuda/moore/cast/kernel.h
new file mode 100644
index 00000000..51dedf79
--- /dev/null
+++ b/src/cuda/moore/cast/kernel.h
@@ -0,0 +1,26 @@
+#ifndef INFINI_OPS_MOORE_CAST_KERNEL_H_
+#define INFINI_OPS_MOORE_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+// clang-format off
+#include "cuda/moore/polyfills.cuh"
+// clang-format on
+
+#include "cuda/cast/kernel.h"
+#include "cuda/moore/caster.cuh"
+#include "cuda/moore/polyfills.cuh"
+#include "cuda/moore/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<MooreRuntime, Cast>
+    : public CudaCast<MooreRuntime, MooreCaster<MooreRuntime>> {
+ public:
+  using CudaCast<MooreRuntime, MooreCaster<MooreRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/nvidia/cast/kernel.h b/src/cuda/nvidia/cast/kernel.h
new file mode 100644
index 00000000..0e69a031
--- /dev/null
+++ b/src/cuda/nvidia/cast/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_NVIDIA_CAST_KERNEL_H_
+#define INFINI_OPS_NVIDIA_CAST_KERNEL_H_
+
+#include <cuda_runtime.h>
+
+#include "cuda/cast/kernel.h"
+#include "cuda/nvidia/caster.cuh"
+#include "cuda/nvidia/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<NvidiaRuntime, Cast>
+    : public CudaCast<NvidiaRuntime, NvidiaCaster<NvidiaRuntime>> {
+ public:
+  using CudaCast<NvidiaRuntime, NvidiaCaster<NvidiaRuntime>>::CudaCast;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/tests/test_cast.py b/tests/test_cast.py
index bd19d934..0f2959a5 100644
--- a/tests/test_cast.py
+++ b/tests/test_cast.py
@@ -14,6 +14,8 @@
         ((13, 4, 4), None, None),
         ((16, 5632), None, None),
         ((4, 4, 5632), None, None),
+        ((0, 16), None, None),
+        ((4, 0, 8), None, None),
     ),
 )
 @pytest.mark.parametrize(
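
Two short reference sketches follow; neither is part of the patch above. The first is a self-contained CUDA program showing the indexing scheme CastKernel depends on: a contiguous tensor is addressed by the flat thread index directly, while a strided view goes through an IndexToOffset translation. The IndexToOffset below is an assumed stand-in for the helper in cuda/kernel_commons.cuh, which this diff does not show, so the real signature may differ.

// Standalone sketch, not part of the diff. IndexToOffset is a guessed
// stand-in for the helper from cuda/kernel_commons.cuh.
#include <cstdio>
#include <cuda_runtime.h>

// Map a flat element index to a memory offset via shape/strides,
// innermost dimension last (row-major order), strides in elements.
__device__ size_t IndexToOffset(size_t idx, size_t ndim, const size_t* shape,
                                const ptrdiff_t* strides) {
  size_t offset = 0;
  for (size_t d = ndim; d-- > 0;) {
    offset += (idx % shape[d]) * strides[d];
    idx /= shape[d];
  }
  return offset;
}

__global__ void CastF32ToI32(int* out, const float* in, const size_t* shape,
                             const ptrdiff_t* strides, size_t n, size_t ndim) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Output is contiguous; input may be strided (e.g. a transposed view).
    out[i] = static_cast<int>(in[IndexToOffset(i, ndim, shape, strides)]);
  }
}

int main() {
  // A 2x3 view that is the transpose of a contiguous 3x2 buffer:
  // shape = {2, 3}, strides = {1, 2}.
  const size_t h_shape[2] = {2, 3};
  const ptrdiff_t h_strides[2] = {1, 2};
  const float h_in[6] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f};

  float* d_in; int* d_out; size_t* d_shape; ptrdiff_t* d_strides;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, 6 * sizeof(int));
  cudaMalloc(&d_shape, sizeof(h_shape));
  cudaMalloc(&d_strides, sizeof(h_strides));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_shape, h_shape, sizeof(h_shape), cudaMemcpyHostToDevice);
  cudaMemcpy(d_strides, h_strides, sizeof(h_strides), cudaMemcpyHostToDevice);

  CastF32ToI32<<<1, 64>>>(d_out, d_in, d_shape, d_strides, 6, 2);

  int h_out[6];
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int v : h_out) printf("%d ", v);  // 0 2 4 1 3 5 (truncation toward 0)
  printf("\n");
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_shape); cudaFree(d_strides);
  return 0;
}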
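
The second sketch isolates the metadata-packing scheme in the CudaCast constructor, with plain cudaMalloc/cudaMemcpy standing in for the Backend wrappers (whose exact signatures the diff only implies): both shape arrays and both stride arrays are staged in a single host buffer and shipped with one device allocation and one host-to-device copy, so each operator instance costs one Malloc/Memcpy pair instead of four.

// Standalone sketch, not part of the diff; PackedMeta/PackMetadata are
// hypothetical names for illustration only.
#include <cstddef>
#include <cstring>
#include <vector>
#include <cuda_runtime.h>

struct PackedMeta {
  std::byte* d_metadata = nullptr;  // single device allocation
  size_t* d_in_shape = nullptr;     // the four views into d_metadata
  size_t* d_out_shape = nullptr;
  ptrdiff_t* d_in_strides = nullptr;
  ptrdiff_t* d_out_strides = nullptr;
};

PackedMeta PackMetadata(const std::vector<size_t>& in_shape,
                        const std::vector<size_t>& out_shape,
                        const std::vector<ptrdiff_t>& in_strides,
                        const std::vector<ptrdiff_t>& out_strides) {
  const size_t ndim = in_shape.size();
  const size_t shape_size = ndim * sizeof(size_t);
  const size_t strides_size = ndim * sizeof(ptrdiff_t);
  const size_t total = 2 * (shape_size + strides_size);

  // Stage everything in one host buffer, then copy once.
  std::vector<std::byte> host(total);
  PackedMeta m;
  cudaMalloc(reinterpret_cast<void**>(&m.d_metadata), total);

  size_t offset = 0;
  m.d_in_shape = reinterpret_cast<size_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, in_shape.data(), shape_size);
  offset += shape_size;
  m.d_out_shape = reinterpret_cast<size_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, out_shape.data(), shape_size);
  offset += shape_size;
  m.d_in_strides = reinterpret_cast<ptrdiff_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, in_strides.data(), strides_size);
  offset += strides_size;
  m.d_out_strides = reinterpret_cast<ptrdiff_t*>(m.d_metadata + offset);
  std::memcpy(host.data() + offset, out_strides.data(), strides_size);

  cudaMemcpy(m.d_metadata, host.data(), total, cudaMemcpyHostToDevice);
  return m;  // caller releases with cudaFree(m.d_metadata)
}

Since size_t and ptrdiff_t are both 8 bytes on the relevant targets, the packed sub-arrays stay naturally aligned without padding, which is what makes the single-allocation layout safe.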