diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 6f127a1722..4a6e342041 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -87,6 +87,7 @@ if(USE_CUDA) source_base/kernels/cuda/math_kernel_op.cu source_base/kernels/cuda/math_kernel_op_vec.cu source_hamilt/module_xc/kernels/cuda/xc_functional_op.cu + source_pw/module_ofdft/kedf_wt_gpu.cu source_pw/module_pwdft/kernels/cuda/cal_density_real_op.cu source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu source_pw/module_pwdft/kernels/cuda/vec_mul_vec_complex.cu diff --git a/source/source_pw/module_ofdft/kedf_wt.cpp b/source/source_pw/module_ofdft/kedf_wt.cpp index 21f4f1f2b4..bc3c93274d 100644 --- a/source/source_pw/module_ofdft/kedf_wt.cpp +++ b/source/source_pw/module_ofdft/kedf_wt.cpp @@ -457,6 +457,13 @@ double KEDF_WT::diff_linhard(double eta, double vw_weight) */ void KEDF_WT::multi_kernel(const double* const* prho, double** rkernel_rho, double exponent, ModulePW::PW_Basis* pw_rho) { +#ifdef __CUDA + if (pw_rho->get_device() == "gpu") { + this->multi_kernel_gpu(prho, rkernel_rho, exponent, pw_rho); + return; + } +#endif + std::complex** recipkernelRho = new std::complex*[PARAM.inp.nspin]; for (int is = 0; is < PARAM.inp.nspin; ++is) { diff --git a/source/source_pw/module_ofdft/kedf_wt.h b/source/source_pw/module_ofdft/kedf_wt.h index e41f3836f5..05a807ef5b 100644 --- a/source/source_pw/module_ofdft/kedf_wt.h +++ b/source/source_pw/module_ofdft/kedf_wt.h @@ -2,12 +2,17 @@ #define KEDF_WT_H #include #include +#include #include "source_base/global_function.h" #include "source_base/matrix.h" #include "source_base/timer.h" #include "source_basis/module_pw/pw_basis.h" +#ifdef __CUDA +#include +#endif + /** * @brief A class which calculates the kinetic energy, potential, and stress with Wang-Teter (WT) KEDF. * See Wang L W, Teter M P. Physical Review B, 1992, 45(23): 13196. @@ -22,6 +27,9 @@ class KEDF_WT } ~KEDF_WT() { +#ifdef __CUDA + this->free_gpu_buffers(); +#endif delete[] this->kernel_; } @@ -65,5 +73,20 @@ class KEDF_WT * 2; // 10/3*(3*pi^2)^{2/3}, multiply by 2 to convert unit from Hartree to Ry, finally in Ry*Bohr^(-2) double wt_coef_ = 0.; // coefficient of WT kernel double* kernel_ = nullptr; + +#ifdef __CUDA + void multi_kernel_gpu(const double* const* prho, double** rkernel_rho, + double exponent, ModulePW::PW_Basis* pw_rho); + void free_gpu_buffers(); + + // Persistent GPU buffers (lazily allocated once, reused across SCF iterations) + double* d_rho_ = nullptr; // real-space input (nrxx doubles) + cufftHandle cufft_plan_fwd_ = 0; // cuFFT forward plan + cufftHandle cufft_plan_bwd_ = 0; // cuFFT backward plan + double* d_result_ = nullptr; // real-space output (nrxx doubles) + double* d_kernel_ = nullptr; // WT kernel on device (npw doubles) + + bool gpu_allocated_ = false; +#endif }; -#endif \ No newline at end of file +#endif diff --git a/source/source_pw/module_ofdft/kedf_wt_gpu.cu b/source/source_pw/module_ofdft/kedf_wt_gpu.cu new file mode 100644 index 0000000000..4b6f9a9c02 --- /dev/null +++ b/source/source_pw/module_ofdft/kedf_wt_gpu.cu @@ -0,0 +1,161 @@ +/** + * @file kedf_wt_gpu.cu + * @brief GPU-accelerated WT KEDF multi_kernel convolution. + * + * Offloads the rho^exponent → FFT → kernel multiply → IFFT pipeline + * to GPU using cuFFT directly (bypassing PW_Basis template API to + * avoid cross-target template instantiation issues). + * + * Persistent GPU buffers are lazily allocated and reused across SCF. + * + * @author Wang Chenxi, Reze + * @date 2026-06 + */ +#include "kedf_wt.h" +#include "source_base/module_device/device_check.h" +#include "source_base/module_device/memory_op.h" +#include "source_io/module_parameter/parameter.h" + +#include +#include +#include + +namespace { + +constexpr int THREADS_PER_BLOCK = 256; + +/// Element-wise multiply: complex array *= real kernel. +__global__ void kedf_wt_recip_multiply( + thrust::complex* __restrict__ data, + const double* __restrict__ kernel, + int npw) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= npw) { return; } + data[idx] *= kernel[idx]; +} + +/// Real → complex conversion (imag = 0). +__global__ void kedf_wt_real_to_complex( + const double* __restrict__ src, + thrust::complex* __restrict__ dst, + int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { return; } + dst[idx] = thrust::complex(src[idx], 0.0); +} + +/// Complex → real with 1/N normalization (cuFFT inverse doesn't normalize). +__global__ void kedf_wt_complex_to_real_norm( + const thrust::complex* __restrict__ src, + double* __restrict__ dst, + double inv_n, + int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { return; } + dst[idx] = src[idx].real() * inv_n; +} + +/// cuFFT error check wrapper. +inline void cufft_check(cufftResult err, const char* file, int line) +{ + if (err != CUFFT_SUCCESS) { + std::cerr << "cuFFT error " << (int)err + << " at " << file << ":" << line << std::endl; + exit(1); + } +} +#define CUFFT_CHECK(call) cufft_check(call, __FILE__, __LINE__) + +} // anonymous namespace + +void KEDF_WT::multi_kernel_gpu( + const double* const* prho, + double** rkernel_rho, + double exponent, + ModulePW::PW_Basis* pw_rho) +{ + const int nrxx = pw_rho->nrxx; + const int npw = pw_rho->npw; + const int nx = pw_rho->nx; + const int ny = pw_rho->ny; + const int nz = pw_rho->nz; + const double inv_nrxx = 1.0 / nrxx; + + // ── Lazy allocation of persistent GPU buffers ── + if (!gpu_allocated_) { + resmem_dd_op()(d_rho_, nrxx); + resmem_dd_op()(d_result_, nrxx * 2); // reused as complex work buffer + resmem_dd_op()(d_kernel_, npw); + + syncmem_d2d_h2d_op()(d_kernel_, this->kernel_, npw); + + // Create cuFFT plans (3D Z2Z, in-place) + CUFFT_CHECK(cufftPlan3d(&cufft_plan_fwd_, nz, ny, nx, CUFFT_Z2Z)); + CUFFT_CHECK(cufftPlan3d(&cufft_plan_bwd_, nz, ny, nx, CUFFT_Z2Z)); + + gpu_allocated_ = true; + } + + const int blocks = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + // d_result_ is double* but reused as cuFFT complex buffer (nrxx*2 = nrxx complex doubles). + auto* d_fft = reinterpret_cast(d_result_); + + for (int is = 0; is < PARAM.inp.nspin; ++is) { + // Step 1: rho^exponent on CPU + for (int ir = 0; ir < nrxx; ++ir) { + rkernel_rho[is][ir] = std::pow(prho[is][ir], exponent); + } + + // Step 2: H → D + syncmem_d2d_h2d_op()(d_rho_, rkernel_rho[is], nrxx); + + // Step 3: Real → Complex on GPU + kedf_wt_real_to_complex<<>>( + d_rho_, + reinterpret_cast*>(d_fft), + nrxx); + CHECK_CUDA_SYNC(); + + // Step 4: Forward FFT (in-place on d_fft) + CUFFT_CHECK(cufftExecZ2Z(cufft_plan_fwd_, d_fft, d_fft, CUFFT_FORWARD)); + + // Step 5: Multiply by WT kernel in G-space + kedf_wt_recip_multiply<<>>( + reinterpret_cast*>(d_fft), + d_kernel_, + npw); + CHECK_CUDA_SYNC(); + + // Step 6: Inverse FFT (in-place on d_fft) + CUFFT_CHECK(cufftExecZ2Z(cufft_plan_bwd_, d_fft, d_fft, CUFFT_INVERSE)); + + // Step 7: Complex → Real with 1/N normalization + kedf_wt_complex_to_real_norm<<>>( + reinterpret_cast*>(d_fft), + d_result_, // back to double* usage + inv_nrxx, + nrxx); + CHECK_CUDA_SYNC(); + + // Step 8: D → H + syncmem_d2d_d2h_op()(rkernel_rho[is], d_result_, nrxx); + } +} + +void KEDF_WT::free_gpu_buffers() +{ + if (!gpu_allocated_) { return; } + + if (cufft_plan_fwd_ != 0) { cufftDestroy(cufft_plan_fwd_); cufft_plan_fwd_ = 0; } + if (cufft_plan_bwd_ != 0) { cufftDestroy(cufft_plan_bwd_); cufft_plan_bwd_ = 0; } + + if (d_rho_ != nullptr) { delmem_dd_op()(d_rho_); d_rho_ = nullptr; } + if (d_result_ != nullptr) { delmem_dd_op()(d_result_); d_result_ = nullptr; } + if (d_kernel_ != nullptr) { delmem_dd_op()(d_kernel_); d_kernel_ = nullptr; } + + gpu_allocated_ = false; +}