Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance: Add CUDA Aware MPI #5930

Merged
merged 26 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fcef6cd
move kpar into read_input_item
Qianruipku Jan 21, 2025
23fc25e
add para_linear_transform_op
Qianruipku Jan 21, 2025
7c4410e
arrange the order in read_input
Qianruipku Jan 21, 2025
c5dc8ce
change name
Qianruipku Jan 22, 2025
21f7818
fix compile
Qianruipku Jan 22, 2025
f789080
Merge branch 'develop' of https://github.com/deepmodeling/abacus-deve…
Qianruipku Jan 22, 2025
c15c823
make bpcg support bndpar > 1
Qianruipku Jan 26, 2025
7ddd28f
fix BPCG
Qianruipku Jan 27, 2025
65e4ba1
fix bug in sDFT-BPCG
Qianruipku Jan 27, 2025
30bc4ad
make sdft+bpcg support GPU
Qianruipku Jan 27, 2025
ecff975
merge
Qianruipku Jan 27, 2025
0664852
update results
Qianruipku Jan 27, 2025
32a204a
fix bug in BPCG
Qianruipku Jan 27, 2025
117f4e8
Merge branch 'hotfix' of https://github.com/Qianruipku/abacus-develop…
Qianruipku Jan 27, 2025
6059cf0
fix tests
Qianruipku Jan 27, 2025
4941bad
fix test
Qianruipku Jan 28, 2025
6970f4c
update results
Qianruipku Jan 28, 2025
a1b041f
update results
Qianruipku Jan 30, 2025
a2513e1
Merge branch 'develop' of https://github.com/deepmodeling/abacus-deve…
Qianruipku Jan 30, 2025
1eeb767
[pre-commit.ci lite] apply automatic fixes
pre-commit-ci-lite[bot] Jan 30, 2025
8b86c7f
update
Qianruipku Jan 31, 2025
ebc39d8
Merge branch 'hotfix' of https://github.com/Qianruipku/abacus-develop…
Qianruipku Jan 31, 2025
da194b9
add CUDA-aware MPI
Qianruipku Feb 25, 2025
35285b8
merge
Qianruipku Feb 25, 2025
a4c050b
update docs
Qianruipku Feb 26, 2025
6955435
Merge branch 'develop' of https://github.com/deepmodeling/abacus-deve…
Qianruipku Feb 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ option(ENABLE_PEXSI "Enable support for PEXSI." OFF)
option(ENABLE_CUSOLVERMP "Enable cusolvermp." OFF)
option(USE_DSP "Enable DSP usage." OFF)
option(USE_CUDA_ON_DCU "Enable CUDA on DCU" OFF)
option(USE_CUDA_MPI "Enable CUDA-aware MPI" OFF)

# enable json support
if(ENABLE_RAPIDJSON)
Expand Down Expand Up @@ -132,6 +133,10 @@ if (USE_CUDA_ON_DCU)
add_compile_definitions(__CUDA_ON_DCU)
endif()

# CUDA-aware MPI: device pointers are handed straight to MPI calls, skipping
# the host staging copies (requires an MPI build with CUDA support).
if (USE_CUDA_MPI)
add_compile_definitions(__CUDA_MPI)
endif()

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

if(ENABLE_COVERAGE)
Expand Down
13 changes: 13 additions & 0 deletions source/module_base/para_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "kernels/math_kernel_op.h"
#include "parallel_device.h"
#include "module_base/timer.h"
namespace ModuleBase
{
template <typename T, typename Device>
Expand Down Expand Up @@ -109,6 +110,7 @@ void PGemmCN<T, Device>::set_dimension(
template <typename T, typename Device>
void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C)
{
ModuleBase::timer::tick("PGemmCN", "multiply");
#ifdef __MPI
if (this->col_nproc > 1)
{
Expand All @@ -126,6 +128,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
{
multiply_single(alpha, A, B, beta, C);
}
ModuleBase::timer::tick("PGemmCN", "multiply");
}

template <typename T, typename Device>
Expand Down Expand Up @@ -154,10 +157,12 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con

std::vector<T> B_tmp(max_colA * LDA);
std::vector<T> isend_tmp;
#ifndef __CUDA_MPI
if (std::is_same<Device, base_device::DEVICE_GPU>::value)
{
isend_tmp.resize(max_colA * LDA);
}
#endif
for (int ip = 0; ip < col_nproc; ip++)
{
if (col_rank != ip)
Expand Down Expand Up @@ -244,6 +249,13 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con

if (this->gatherC)
{
#ifdef __CUDA_MPI
if (this->row_nproc > 1)
{
Parallel_Common::reduce_data(C_local, size_C_local, row_world);
}
Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world);
#else
T* Cglobal_cpu = nullptr;
T* Clocal_cpu = C_tmp.data();
std::vector<T> cpu_tmp;
Expand Down Expand Up @@ -277,6 +289,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
{
syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
}
#endif
}
else
{
Expand Down
2 changes: 2 additions & 0 deletions source/module_base/parallel_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::comple
MPI_Allgatherv(sendbuf, sendcount, MPI_COMPLEX, recvbuf, recvcounts, displs, MPI_COMPLEX, comm);
}

#ifndef __CUDA_MPI
template <typename T>
struct object_cpu_point<T, base_device::DEVICE_GPU>
{
Expand Down Expand Up @@ -171,6 +172,7 @@ template struct object_cpu_point<float, base_device::DEVICE_CPU>;
template struct object_cpu_point<float, base_device::DEVICE_GPU>;
template struct object_cpu_point<std::complex<float>, base_device::DEVICE_CPU>;
template struct object_cpu_point<std::complex<float>, base_device::DEVICE_GPU>;
#endif

} // namespace Parallel_Common
#endif
22 changes: 22 additions & 0 deletions source/module_base/parallel_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ void gatherv_data(const std::complex<double>* sendbuf, int sendcount, std::compl
void gatherv_data(const float* sendbuf, int sendcount, float* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);
void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::complex<float>* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);

#ifndef __CUDA_MPI
template<typename T, typename Device>
struct object_cpu_point
{
Expand All @@ -41,6 +42,7 @@ struct object_cpu_point
void sync_d2h(T* object_cpu, const T* object, const int& n);
void sync_h2d(T* object, const T* object_cpu, const int& n);
};
#endif

/**
* @brief send data in Device
Expand All @@ -49,11 +51,15 @@ struct object_cpu_point
template <typename T, typename Device>
void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
send_data(object, count, dest, tag, comm);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, count, tmp_space);
o.sync_d2h(object_cpu, object, count);
send_data(object_cpu, count, dest, tag, comm);
o.del(object_cpu);
#endif
return;
}

Expand All @@ -65,11 +71,15 @@ void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T*
/**
 * @brief Non-blocking send of `count` elements that live in Device memory.
 *
 * With __CUDA_MPI the device pointer is handed directly to MPI_Isend via
 * isend_data(); otherwise the data is first copied device-to-host into a
 * staging buffer obtained from object_cpu_point.
 *
 * @param object     source buffer in Device memory
 * @param count      number of elements of type T
 * @param dest       destination rank in `comm`
 * @param tag        MPI message tag
 * @param comm       communicator
 * @param request    out: MPI request for completion of the non-blocking send
 * @param send_space caller-provided host staging buffer (unlike send_dev,
 *                   there is no default here — callers must pass it)
 *
 * NOTE(review): the message buffer of a non-blocking send must stay valid
 * until the request completes. `o.del(object_cpu)` runs immediately after
 * isend_data(), so this is only safe if del() is a no-op when the buffer came
 * from the caller-supplied `send_space` (object_cpu_point's implementation is
 * not visible here) — verify that callers always pass a long-lived
 * `send_space` on the GPU path and wait on `request` before reusing it.
 */
template <typename T, typename Device>
void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MPI_Request* request, T* send_space)
{
#ifdef __CUDA_MPI
// CUDA-aware MPI: no host staging needed.
isend_data(object, count, dest, tag, comm, request);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, count, send_space);
o.sync_d2h(object_cpu, object, count);
isend_data(object_cpu, count, dest, tag, comm, request);
o.del(object_cpu);
#endif
return;
}

Expand All @@ -80,11 +90,15 @@ void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MP
template <typename T, typename Device>
void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Status* status, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
recv_data(object, count, source, tag, comm, status);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, count, tmp_space);
recv_data(object_cpu, count, source, tag, comm, status);
o.sync_h2d(object, object_cpu, count);
o.del(object_cpu);
#endif
return;
}

Expand All @@ -102,24 +116,32 @@ void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Sta
template <typename T, typename Device>
void bcast_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
bcast_data(object, n, comm);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, n, tmp_space);
o.sync_d2h(object_cpu, object, n);
bcast_data(object_cpu, n, comm);
o.sync_h2d(object, object_cpu, n);
o.del(object_cpu);
#endif
return;
}

template <typename T, typename Device>
void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
reduce_data(object, n, comm);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, n, tmp_space);
o.sync_d2h(object_cpu, object, n);
reduce_data(object_cpu, n, comm);
o.sync_h2d(object, object_cpu, n);
o.del(object_cpu);
#endif
return;
}
}
Expand Down
5 changes: 5 additions & 0 deletions source/module_hsolver/para_linear_transform.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "para_linear_transform.h"
#include "module_base/timer.h"

#include <algorithm>
#include <vector>
Expand Down Expand Up @@ -54,6 +55,7 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA,
template <typename T, typename Device>
void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B)
{
ModuleBase::timer::tick("PLinearTransform", "act");
const Device* ctx = {};
#ifdef __MPI
if (nproc_col > 1)
Expand All @@ -65,7 +67,9 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
if (std::is_same<Device, base_device::DEVICE_GPU>::value)
{
A_tmp_device = nullptr;
#ifndef __CUDA_MPI
isend_tmp.resize(max_colA * LDA);
#endif
resmem_dev_op()(A_tmp_device, max_colA * LDA);
}
T* B_tmp = nullptr;
Expand Down Expand Up @@ -168,6 +172,7 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
B,
LDA);
}
ModuleBase::timer::tick("PLinearTransform", "act");
};

template struct PLinearTransform<double, base_device::DEVICE_CPU>;
Expand Down
Loading