diff --git a/CMakeLists.txt b/CMakeLists.txt
index d1487af838..83e864586a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,6 +42,7 @@ option(ENABLE_PEXSI "Enable support for PEXSI." OFF)
 option(ENABLE_CUSOLVERMP "Enable cusolvermp." OFF)
 option(USE_DSP "Enable DSP usage." OFF)
 option(USE_CUDA_ON_DCU "Enable CUDA on DCU" OFF)
+option(USE_CUDA_MPI "Enable CUDA-aware MPI" OFF)
 
 # enable json support
 if(ENABLE_RAPIDJSON)
@@ -132,6 +133,10 @@ if (USE_CUDA_ON_DCU)
   add_compile_definitions(__CUDA_ON_DCU)
 endif()
 
+if (USE_CUDA_MPI)
+  add_compile_definitions(__CUDA_MPI)
+endif()
+
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 if(ENABLE_COVERAGE)
diff --git a/docs/advanced/install.md b/docs/advanced/install.md
index 88714b2d7d..b6ccdc8b11 100644
--- a/docs/advanced/install.md
+++ b/docs/advanced/install.md
@@ -115,6 +115,8 @@ To build NVIDIA GPU support for ABACUS, define `USE_CUDA` flag. You can also spe
 cmake -B build -DUSE_CUDA=1 -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc
 ```
 
+If you are confident that your MPI implementation is CUDA-aware, you can add `-DUSE_CUDA_MPI=ON`. In this case, the program will pass GPU buffers directly to MPI calls, rather than first copying the data to the CPU before communication. However, note that if your MPI implementation is not CUDA-aware, building with `-DUSE_CUDA_MPI=ON` will cause the program to fail at runtime.
+
 ## Build math library from source
 
 > Note: We recommend using the latest available compiler sets, since they offer faster implementations of math functions.
diff --git a/source/module_base/para_gemm.cpp b/source/module_base/para_gemm.cpp
index 2181a5b84d..4f83fce9cb 100644
--- a/source/module_base/para_gemm.cpp
+++ b/source/module_base/para_gemm.cpp
@@ -2,6 +2,7 @@
 
 #include "kernels/math_kernel_op.h"
 #include "parallel_device.h"
+#include "module_base/timer.h"
 namespace ModuleBase
 {
 template <typename T, typename Device>
@@ -109,6 +110,7 @@ void PGemmCN<T, Device>::set_dimension(
 template <typename T, typename Device>
 void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C)
 {
+    ModuleBase::timer::tick("PGemmCN", "multiply");
 #ifdef __MPI
     if (this->col_nproc > 1)
     {
@@ -126,6 +128,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
     {
         multiply_single(alpha, A, B, beta, C);
     }
+    ModuleBase::timer::tick("PGemmCN", "multiply");
 }
 
 template <typename T, typename Device>
@@ -154,10 +157,12 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
 
     std::vector<T> B_tmp(max_colA * LDA);
     std::vector<T> isend_tmp;
+#ifndef __CUDA_MPI
     if (std::is_same<Device, base_device::DEVICE_GPU>::value)
     {
         isend_tmp.resize(max_colA * LDA);
     }
+#endif
     for (int ip = 0; ip < col_nproc; ip++)
     {
         if (col_rank != ip)
@@ -244,6 +249,13 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
 
     if (this->gatherC)
     {
+#ifdef __CUDA_MPI
+        if (this->row_nproc > 1)
+        {
+            Parallel_Common::reduce_data(C_local, size_C_local, row_world);
+        }
+        Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world);
+#else
         T* Cglobal_cpu = nullptr;
         T* Clocal_cpu = C_tmp.data();
         std::vector<T> cpu_tmp;
@@ -277,6 +289,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
         {
             syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
         }
+#endif
     }
     else
     {
diff --git a/source/module_base/parallel_device.cpp b/source/module_base/parallel_device.cpp
index b5ade6c56f..933064e248 100644
--- a/source/module_base/parallel_device.cpp
+++ b/source/module_base/parallel_device.cpp
@@ -99,6 +99,7 @@ void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::comple
     MPI_Allgatherv(sendbuf, sendcount, MPI_COMPLEX, recvbuf, recvcounts, displs, MPI_COMPLEX, comm);
 }
 
+#ifndef __CUDA_MPI
 template <typename T>
 struct object_cpu_point<T, base_device::DEVICE_GPU>
 {
@@ -171,6 +172,7 @@ template struct object_cpu_point<float, base_device::DEVICE_CPU>;
 template struct object_cpu_point<float, base_device::DEVICE_GPU>;
 template struct object_cpu_point<std::complex<float>, base_device::DEVICE_CPU>;
 template struct object_cpu_point<std::complex<float>, base_device::DEVICE_GPU>;
+#endif
 
 } // namespace Parallel_Common
 #endif
\ No newline at end of file
diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h
index 5b225de9dc..fbf127d07d 100644
--- a/source/module_base/parallel_device.h
+++ b/source/module_base/parallel_device.h
@@ -32,6 +32,7 @@ void gatherv_data(const std::complex<double>* sendbuf, int sendcount, std::compl
 void gatherv_data(const float* sendbuf, int sendcount, float* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);
 void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::complex<float>* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);
 
+#ifndef __CUDA_MPI
 template<typename T, typename Device>
 struct object_cpu_point
 {
@@ -41,6 +42,7 @@ struct object_cpu_point
     void sync_d2h(T* object_cpu, const T* object, const int& n);
     void sync_h2d(T* object, const T* object_cpu, const int& n);
 };
+#endif
 
 /**
  * @brief send data in Device
@@ -49,11 +51,15 @@ struct object_cpu_point
 template <typename T, typename Device>
 void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T* tmp_space = nullptr)
 {
+#ifdef __CUDA_MPI
+    send_data(object, count, dest, tag, comm);
+#else
     object_cpu_point<T,Device> o;
     T* object_cpu = o.get(object, count, tmp_space);
     o.sync_d2h(object_cpu, object, count);
     send_data(object_cpu, count, dest, tag, comm);
     o.del(object_cpu);
+#endif
     return;
 }
 
@@ -65,11 +71,15 @@ void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T*
 template <typename T, typename Device>
 void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MPI_Request* request, T* send_space)
 {
+#ifdef __CUDA_MPI
+    isend_data(object, count, dest, tag, comm, request);
+#else
     object_cpu_point<T,Device> o;
     T* object_cpu = o.get(object, count, send_space);
     o.sync_d2h(object_cpu, object, count);
     isend_data(object_cpu, count, dest, tag, comm, request);
     o.del(object_cpu);
+#endif
     return;
 }
 
@@ -80,11 +90,15 @@ void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MP
 template <typename T, typename Device>
 void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Status* status, T* tmp_space = nullptr)
 {
+#ifdef __CUDA_MPI
+    recv_data(object, count, source, tag, comm, status);
+#else
     object_cpu_point<T,Device> o;
     T* object_cpu = o.get(object, count, tmp_space);
     recv_data(object_cpu, count, source, tag, comm, status);
     o.sync_h2d(object, object_cpu, count);
     o.del(object_cpu);
+#endif
     return;
 }
 
@@ -102,24 +116,32 @@ void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Sta
 template <typename T, typename Device>
 void bcast_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
 {
+#ifdef __CUDA_MPI
+    bcast_data(object, n, comm);
+#else
     object_cpu_point<T,Device> o;
     T* object_cpu = o.get(object, n, tmp_space);
     o.sync_d2h(object_cpu, object, n);
     bcast_data(object_cpu, n, comm);
     o.sync_h2d(object, object_cpu, n);
     o.del(object_cpu);
+#endif
     return;
 }
 
 template <typename T, typename Device>
 void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
 {
+#ifdef __CUDA_MPI
+    reduce_data(object, n, comm);
+#else
     object_cpu_point<T,Device> o;
     T* object_cpu = o.get(object, n, tmp_space);
     o.sync_d2h(object_cpu, object, n);
     reduce_data(object_cpu, n, comm);
     o.sync_h2d(object, object_cpu, n);
     o.del(object_cpu);
+#endif
     return;
 }
 }
diff --git a/source/module_hsolver/para_linear_transform.cpp b/source/module_hsolver/para_linear_transform.cpp
index 17e267101f..dd259e9b8a 100644
--- a/source/module_hsolver/para_linear_transform.cpp
+++ b/source/module_hsolver/para_linear_transform.cpp
@@ -1,4 +1,5 @@
 #include "para_linear_transform.h"
+#include "module_base/timer.h"
 
 #include <algorithm>
 #include <vector>
@@ -54,6 +55,7 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA,
 template <typename T, typename Device>
 void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B)
 {
+    ModuleBase::timer::tick("PLinearTransform", "act");
     const Device* ctx = {};
 #ifdef __MPI
     if (nproc_col > 1)
@@ -65,7 +67,9 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
             A_tmp_device = nullptr;
+#ifndef __CUDA_MPI
             isend_tmp.resize(max_colA * LDA);
+#endif
             resmem_dev_op()(A_tmp_device, max_colA * LDA);
         }
         T* B_tmp = nullptr;
@@ -168,6 +172,7 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
                                          B,
                                          LDA);
     }
+    ModuleBase::timer::tick("PLinearTransform", "act");
 };
 
 template struct PLinearTransform<double, base_device::DEVICE_CPU>;