Skip to content

Commit 9448ba9

Browse files
authored
Performance: Add CUDA Aware MPI (#5930)
* add CUDA-aware MPI * update docs
1 parent b06a163 commit 9448ba9

File tree

6 files changed

+49
-0
lines changed

6 files changed

+49
-0
lines changed

CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ option(ENABLE_PEXSI "Enable support for PEXSI." OFF)
4242
option(ENABLE_CUSOLVERMP "Enable cusolvermp." OFF)
4343
option(USE_DSP "Enable DSP usage." OFF)
4444
option(USE_CUDA_ON_DCU "Enable CUDA on DCU" OFF)
45+
option(USE_CUDA_MPI "Enable CUDA-aware MPI" OFF)
4546

4647
# enable json support
4748
if(ENABLE_RAPIDJSON)
@@ -132,6 +133,10 @@ if (USE_CUDA_ON_DCU)
132133
add_compile_definitions(__CUDA_ON_DCU)
133134
endif()
134135

136+
if (USE_CUDA_MPI)
137+
add_compile_definitions(__CUDA_MPI)
138+
endif()
139+
135140
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
136141

137142
if(ENABLE_COVERAGE)

docs/advanced/install.md

+2
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ To build NVIDIA GPU support for ABACUS, define `USE_CUDA` flag. You can also spe
115115
cmake -B build -DUSE_CUDA=1 -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc
116116
```
117117

118+
If you are confident that your MPI implementation is CUDA-aware, you can add `-DUSE_CUDA_MPI=ON`. In this case, MPI communication operates directly on GPU memory, rather than transferring data to the CPU first before communication. Note that if your MPI implementation is not CUDA-aware, building with `-DUSE_CUDA_MPI=ON` will cause the program to throw an error at runtime.
119+
118120
## Build math library from source
119121

120122
> Note: We recommend using the latest available compiler sets, since they offer faster implementations of math functions.

source/module_base/para_gemm.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "kernels/math_kernel_op.h"
44
#include "parallel_device.h"
5+
#include "module_base/timer.h"
56
namespace ModuleBase
67
{
78
template <typename T, typename Device>
@@ -109,6 +110,7 @@ void PGemmCN<T, Device>::set_dimension(
109110
template <typename T, typename Device>
110111
void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C)
111112
{
113+
ModuleBase::timer::tick("PGemmCN", "multiply");
112114
#ifdef __MPI
113115
if (this->col_nproc > 1)
114116
{
@@ -126,6 +128,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
126128
{
127129
multiply_single(alpha, A, B, beta, C);
128130
}
131+
ModuleBase::timer::tick("PGemmCN", "multiply");
129132
}
130133

131134
template <typename T, typename Device>
@@ -154,10 +157,12 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
154157

155158
std::vector<T> B_tmp(max_colA * LDA);
156159
std::vector<T> isend_tmp;
160+
#ifndef __CUDA_MPI
157161
if (std::is_same<Device, base_device::DEVICE_GPU>::value)
158162
{
159163
isend_tmp.resize(max_colA * LDA);
160164
}
165+
#endif
161166
for (int ip = 0; ip < col_nproc; ip++)
162167
{
163168
if (col_rank != ip)
@@ -244,6 +249,13 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
244249

245250
if (this->gatherC)
246251
{
252+
#ifdef __CUDA_MPI
253+
if (this->row_nproc > 1)
254+
{
255+
Parallel_Common::reduce_data(C_local, size_C_local, row_world);
256+
}
257+
Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world);
258+
#else
247259
T* Cglobal_cpu = nullptr;
248260
T* Clocal_cpu = C_tmp.data();
249261
std::vector<T> cpu_tmp;
@@ -277,6 +289,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
277289
{
278290
syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
279291
}
292+
#endif
280293
}
281294
else
282295
{

source/module_base/parallel_device.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::comple
9999
MPI_Allgatherv(sendbuf, sendcount, MPI_COMPLEX, recvbuf, recvcounts, displs, MPI_COMPLEX, comm);
100100
}
101101

102+
#ifndef __CUDA_MPI
102103
template <typename T>
103104
struct object_cpu_point<T, base_device::DEVICE_GPU>
104105
{
@@ -171,6 +172,7 @@ template struct object_cpu_point<float, base_device::DEVICE_CPU>;
171172
template struct object_cpu_point<float, base_device::DEVICE_GPU>;
172173
template struct object_cpu_point<std::complex<float>, base_device::DEVICE_CPU>;
173174
template struct object_cpu_point<std::complex<float>, base_device::DEVICE_GPU>;
175+
#endif
174176

175177
} // namespace Parallel_Common
176178
#endif

source/module_base/parallel_device.h

+22
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ void gatherv_data(const std::complex<double>* sendbuf, int sendcount, std::compl
3232
void gatherv_data(const float* sendbuf, int sendcount, float* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);
3333
void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::complex<float>* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);
3434

35+
#ifndef __CUDA_MPI
3536
template<typename T, typename Device>
3637
struct object_cpu_point
3738
{
@@ -41,6 +42,7 @@ struct object_cpu_point
4142
void sync_d2h(T* object_cpu, const T* object, const int& n);
4243
void sync_h2d(T* object, const T* object_cpu, const int& n);
4344
};
45+
#endif
4446

4547
/**
4648
* @brief send data in Device
@@ -49,11 +51,15 @@ struct object_cpu_point
4951
template <typename T, typename Device>
5052
void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T* tmp_space = nullptr)
5153
{
54+
#ifdef __CUDA_MPI
55+
send_data(object, count, dest, tag, comm);
56+
#else
5257
object_cpu_point<T,Device> o;
5358
T* object_cpu = o.get(object, count, tmp_space);
5459
o.sync_d2h(object_cpu, object, count);
5560
send_data(object_cpu, count, dest, tag, comm);
5661
o.del(object_cpu);
62+
#endif
5763
return;
5864
}
5965

@@ -65,11 +71,15 @@ void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T*
6571
template <typename T, typename Device>
6672
void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MPI_Request* request, T* send_space)
6773
{
74+
#ifdef __CUDA_MPI
75+
isend_data(object, count, dest, tag, comm, request);
76+
#else
6877
object_cpu_point<T,Device> o;
6978
T* object_cpu = o.get(object, count, send_space);
7079
o.sync_d2h(object_cpu, object, count);
7180
isend_data(object_cpu, count, dest, tag, comm, request);
7281
o.del(object_cpu);
82+
#endif
7383
return;
7484
}
7585

@@ -80,11 +90,15 @@ void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MP
8090
template <typename T, typename Device>
8191
void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Status* status, T* tmp_space = nullptr)
8292
{
93+
#ifdef __CUDA_MPI
94+
recv_data(object, count, source, tag, comm, status);
95+
#else
8396
object_cpu_point<T,Device> o;
8497
T* object_cpu = o.get(object, count, tmp_space);
8598
recv_data(object_cpu, count, source, tag, comm, status);
8699
o.sync_h2d(object, object_cpu, count);
87100
o.del(object_cpu);
101+
#endif
88102
return;
89103
}
90104

@@ -102,24 +116,32 @@ void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Sta
102116
template <typename T, typename Device>
103117
void bcast_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
104118
{
119+
#ifdef __CUDA_MPI
120+
bcast_data(object, n, comm);
121+
#else
105122
object_cpu_point<T,Device> o;
106123
T* object_cpu = o.get(object, n, tmp_space);
107124
o.sync_d2h(object_cpu, object, n);
108125
bcast_data(object_cpu, n, comm);
109126
o.sync_h2d(object, object_cpu, n);
110127
o.del(object_cpu);
128+
#endif
111129
return;
112130
}
113131

114132
template <typename T, typename Device>
115133
void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
116134
{
135+
#ifdef __CUDA_MPI
136+
reduce_data(object, n, comm);
137+
#else
117138
object_cpu_point<T,Device> o;
118139
T* object_cpu = o.get(object, n, tmp_space);
119140
o.sync_d2h(object_cpu, object, n);
120141
reduce_data(object_cpu, n, comm);
121142
o.sync_h2d(object, object_cpu, n);
122143
o.del(object_cpu);
144+
#endif
123145
return;
124146
}
125147
}

source/module_hsolver/para_linear_transform.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "para_linear_transform.h"
2+
#include "module_base/timer.h"
23

34
#include <algorithm>
45
#include <vector>
@@ -54,6 +55,7 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA,
5455
template <typename T, typename Device>
5556
void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B)
5657
{
58+
ModuleBase::timer::tick("PLinearTransform", "act");
5759
const Device* ctx = {};
5860
#ifdef __MPI
5961
if (nproc_col > 1)
@@ -65,7 +67,9 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
6567
if (std::is_same<Device, base_device::DEVICE_GPU>::value)
6668
{
6769
A_tmp_device = nullptr;
70+
#ifndef __CUDA_MPI
6871
isend_tmp.resize(max_colA * LDA);
72+
#endif
6973
resmem_dev_op()(A_tmp_device, max_colA * LDA);
7074
}
7175
T* B_tmp = nullptr;
@@ -168,6 +172,7 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
168172
B,
169173
LDA);
170174
}
175+
ModuleBase::timer::tick("PLinearTransform", "act");
171176
};
172177

173178
template struct PLinearTransform<double, base_device::DEVICE_CPU>;

0 commit comments

Comments
 (0)