diff --git a/CMakeLists.txt b/CMakeLists.txt index d1487af838..83e864586a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,7 @@ option(ENABLE_PEXSI "Enable support for PEXSI." OFF) option(ENABLE_CUSOLVERMP "Enable cusolvermp." OFF) option(USE_DSP "Enable DSP usage." OFF) option(USE_CUDA_ON_DCU "Enable CUDA on DCU" OFF) +option(USE_CUDA_MPI "Enable CUDA-aware MPI" OFF) # enable json support if(ENABLE_RAPIDJSON) @@ -132,6 +133,10 @@ if (USE_CUDA_ON_DCU) add_compile_definitions(__CUDA_ON_DCU) endif() +if (USE_CUDA_MPI) + add_compile_definitions(__CUDA_MPI) +endif() + list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) if(ENABLE_COVERAGE) diff --git a/docs/advanced/install.md b/docs/advanced/install.md index 88714b2d7d..b6ccdc8b11 100644 --- a/docs/advanced/install.md +++ b/docs/advanced/install.md @@ -115,6 +115,8 @@ To build NVIDIA GPU support for ABACUS, define `USE_CUDA` flag. You can also spe cmake -B build -DUSE_CUDA=1 -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc ``` +If you are confident that your MPI implementation is CUDA-aware, you can add `-DUSE_CUDA_MPI=ON`. In this case, the program communicates data directly between GPU memory buffers during MPI calls, rather than first copying the data to the CPU before communication. Note, however, that if your MPI is not CUDA-aware, building with `-DUSE_CUDA_MPI=ON` will cause the program to throw an error. + ## Build math library from source > Note: We recommend using the latest available compiler sets, since they offer faster implementations of math functions. 
diff --git a/source/module_base/para_gemm.cpp b/source/module_base/para_gemm.cpp index 2181a5b84d..4f83fce9cb 100644 --- a/source/module_base/para_gemm.cpp +++ b/source/module_base/para_gemm.cpp @@ -2,6 +2,7 @@ #include "kernels/math_kernel_op.h" #include "parallel_device.h" +#include "module_base/timer.h" namespace ModuleBase { template <typename T, typename Device> @@ -109,6 +110,7 @@ void PGemmCN<T, Device>::set_dimension( template <typename T, typename Device> void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C) { + ModuleBase::timer::tick("PGemmCN", "multiply"); #ifdef __MPI if (this->col_nproc > 1) { @@ -126,6 +128,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T { multiply_single(alpha, A, B, beta, C); } + ModuleBase::timer::tick("PGemmCN", "multiply"); } template <typename T, typename Device> @@ -154,10 +157,12 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con std::vector<T> B_tmp(max_colA * LDA); std::vector<T> isend_tmp; +#ifndef __CUDA_MPI if (std::is_same<Device, base_device::DEVICE_GPU>::value) { isend_tmp.resize(max_colA * LDA); } +#endif for (int ip = 0; ip < col_nproc; ip++) { if (col_rank != ip) @@ -244,6 +249,13 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con if (this->gatherC) { +#ifdef __CUDA_MPI + if (this->row_nproc > 1) + { + Parallel_Common::reduce_data(C_local, size_C_local, row_world); + } + Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world); +#else T* Cglobal_cpu = nullptr; T* Clocal_cpu = C_tmp.data(); std::vector<T> cpu_tmp; @@ -277,6 +289,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con { syncmem_h2d_op()(C, Cglobal_cpu, size_C_global); } +#endif } else { diff --git a/source/module_base/parallel_device.cpp b/source/module_base/parallel_device.cpp index b5ade6c56f..933064e248 100644 --- 
a/source/module_base/parallel_device.cpp +++ b/source/module_base/parallel_device.cpp @@ -99,6 +99,7 @@ void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::comple MPI_Allgatherv(sendbuf, sendcount, MPI_COMPLEX, recvbuf, recvcounts, displs, MPI_COMPLEX, comm); } +#ifndef __CUDA_MPI template <typename T> struct object_cpu_point<T, base_device::DEVICE_GPU> { @@ -171,6 +172,7 @@ template struct object_cpu_point<float, base_device::DEVICE_CPU>; template struct object_cpu_point<float, base_device::DEVICE_GPU>; template struct object_cpu_point<std::complex<float>, base_device::DEVICE_CPU>; template struct object_cpu_point<std::complex<float>, base_device::DEVICE_GPU>; +#endif } // namespace Parallel_Common #endif \ No newline at end of file diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h index 5b225de9dc..fbf127d07d 100644 --- a/source/module_base/parallel_device.h +++ b/source/module_base/parallel_device.h @@ -32,6 +32,7 @@ void gatherv_data(const std::complex<double>* sendbuf, int sendcount, std::compl void gatherv_data(const float* sendbuf, int sendcount, float* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm); void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::complex<float>* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm); +#ifndef __CUDA_MPI template<typename T, typename Device> struct object_cpu_point { @@ -41,6 +42,7 @@ struct object_cpu_point void sync_d2h(T* object_cpu, const T* object, const int& n); void sync_h2d(T* object, const T* object_cpu, const int& n); }; +#endif /** * @brief send data in Device @@ -49,11 +51,15 @@ struct object_cpu_point template <typename T, typename Device> void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T* tmp_space = nullptr) { +#ifdef __CUDA_MPI + send_data(object, count, dest, tag, comm); +#else object_cpu_point<T,Device> o; T* object_cpu = o.get(object, count, tmp_space); 
o.sync_d2h(object_cpu, object, count); send_data(object_cpu, count, dest, tag, comm); o.del(object_cpu); +#endif return; } @@ -65,11 +71,15 @@ void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T* template <typename T, typename Device> void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MPI_Request* request, T* send_space) { +#ifdef __CUDA_MPI + isend_data(object, count, dest, tag, comm, request); +#else object_cpu_point<T,Device> o; T* object_cpu = o.get(object, count, send_space); o.sync_d2h(object_cpu, object, count); isend_data(object_cpu, count, dest, tag, comm, request); o.del(object_cpu); +#endif return; } @@ -80,11 +90,15 @@ void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MP template <typename T, typename Device> void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Status* status, T* tmp_space = nullptr) { +#ifdef __CUDA_MPI + recv_data(object, count, source, tag, comm, status); +#else object_cpu_point<T,Device> o; T* object_cpu = o.get(object, count, tmp_space); recv_data(object_cpu, count, source, tag, comm, status); o.sync_h2d(object, object_cpu, count); o.del(object_cpu); +#endif return; } @@ -102,24 +116,32 @@ void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Sta template <typename T, typename Device> void bcast_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr) { +#ifdef __CUDA_MPI + bcast_data(object, n, comm); +#else object_cpu_point<T,Device> o; T* object_cpu = o.get(object, n, tmp_space); o.sync_d2h(object_cpu, object, n); bcast_data(object_cpu, n, comm); o.sync_h2d(object, object_cpu, n); o.del(object_cpu); +#endif return; } template <typename T, typename Device> void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr) { +#ifdef __CUDA_MPI + reduce_data(object, n, comm); +#else object_cpu_point<T,Device> o; T* object_cpu = o.get(object, n, tmp_space); 
o.sync_d2h(object_cpu, object, n); reduce_data(object_cpu, n, comm); o.sync_h2d(object, object_cpu, n); o.del(object_cpu); +#endif return; } } diff --git a/source/module_hsolver/para_linear_transform.cpp b/source/module_hsolver/para_linear_transform.cpp index 17e267101f..dd259e9b8a 100644 --- a/source/module_hsolver/para_linear_transform.cpp +++ b/source/module_hsolver/para_linear_transform.cpp @@ -1,4 +1,5 @@ #include "para_linear_transform.h" +#include "module_base/timer.h" #include <algorithm> #include <vector> @@ -54,6 +55,7 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA, template <typename T, typename Device> void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B) { + ModuleBase::timer::tick("PLinearTransform", "act"); const Device* ctx = {}; #ifdef __MPI if (nproc_col > 1) @@ -65,7 +67,9 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con if (std::is_same<Device, base_device::DEVICE_GPU>::value) { A_tmp_device = nullptr; +#ifndef __CUDA_MPI isend_tmp.resize(max_colA * LDA); +#endif resmem_dev_op()(A_tmp_device, max_colA * LDA); } T* B_tmp = nullptr; @@ -168,6 +172,7 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con B, LDA); } + ModuleBase::timer::tick("PLinearTransform", "act"); }; template struct PLinearTransform<double, base_device::DEVICE_CPU>;