Skip to content

Commit c117925

Browse files
Parallel-Split MPI_Alltoall algorithm as part of acoll collective.
- A new parallel-split algorithm for MPI_Alltoall is introduced as part of the acoll collective component, primarily targeting smaller message sizes (<= 4KB). At a high level, the algorithm operates by dividing the ranks into n groups and performing alltoall (using a base alltoall routine) within the n groups in parallel, after which data is exchanged within groups of n adjacent ranks (starting from rank 0). For example, if n = 2, the algorithm splits the ranks into 2 groups, one containing all even-ranked processes and the other containing all odd-ranked processes. Alltoall is performed within these 2 groups in parallel, after which each adjacent even-odd pair (the pairs being [0,1], [2,3], ...) exchanges data to complete the alltoall operation. If n = 4 or n = 8, alltoall is performed within 4 or 8 groups in parallel. Following this step, groups of 4 or 8 adjacent ranks (starting from rank 0) exchange data among themselves to complete the alltoall operation. - Additionally, for intra-node cases, an xpmem-based linear algorithm for MPI_Alltoall is added as part of acoll. When sbuf and rbuf can be exposed via xpmem, the alltoall algorithm can be implemented as a linear direct copy from the sbuf of every other rank to the rbuf of a given rank. Signed-off-by: Mithun Mohan <[email protected]>
1 parent c71d630 commit c117925

6 files changed

+807
-0
lines changed

ompi/mca/coll/acoll/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ sources = \
1515
coll_acoll_allgather.c \
1616
coll_acoll_bcast.c \
1717
coll_acoll_gather.c \
18+
coll_acoll_alltoall.c \
1819
coll_acoll_reduce.c \
1920
coll_acoll_allreduce.c \
2021
coll_acoll_barrier.c \

ompi/mca/coll/acoll/coll_acoll.h

+22
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,13 @@ int mca_coll_acoll_gather_intra(const void *sbuf, size_t scount, struct ompi_dat
6666
void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype, int root,
6767
struct ompi_communicator_t *comm, mca_coll_base_module_t *module);
6868

69+
int mca_coll_acoll_alltoall(const void *sbuf, size_t scount,
70+
struct ompi_datatype_t *sdtype,
71+
void* rbuf, size_t rcount,
72+
struct ompi_datatype_t *rdtype,
73+
struct ompi_communicator_t *comm,
74+
mca_coll_base_module_t *module);
75+
6976
int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
7077
struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root,
7178
struct ompi_communicator_t *comm, mca_coll_base_module_t *module);
@@ -80,6 +87,7 @@ int mca_coll_acoll_barrier_intra(struct ompi_communicator_t *comm, mca_coll_base
8087
END_C_DECLS
8188

8289
#define MCA_COLL_ACOLL_ROOT_CHANGE_THRESH 10
90+
#define MCA_COLL_ACOLL_SPLIT_FACTOR_LIST_LEN 3
8391

8492
typedef enum MCA_COLL_ACOLL_SG_SIZES {
8593
MCA_COLL_ACOLL_SG_SIZE_1 = 8,
@@ -142,6 +150,18 @@ typedef struct coll_acoll_data {
142150
int sync[2];
143151
} coll_acoll_data_t;
144152

153+
/* The enum literals are used as indices into arrays, so an explicit
154+
* starting value is assigned to ensure the indices are valid irrespective
155+
* of what the compiler would otherwise assign. */
156+
typedef enum MCA_COLL_ACOLL_R2R_DIST {
157+
DIST_CORE = 0,
158+
DIST_L3CACHE,
159+
DIST_NUMA,
160+
DIST_SOCKET,
161+
DIST_NODE,
162+
DIST_END
163+
} MCA_COLL_ACOLL_R2R_DIST_T;
164+
145165
typedef struct coll_acoll_subcomms {
146166
ompi_communicator_t *local_comm;
147167
ompi_communicator_t *local_r_comm;
@@ -152,6 +172,7 @@ typedef struct coll_acoll_subcomms {
152172
ompi_communicator_t *orig_comm;
153173
ompi_communicator_t *socket_comm;
154174
ompi_communicator_t *socket_ldr_comm;
175+
ompi_communicator_t *split_comm[MCA_COLL_ACOLL_SPLIT_FACTOR_LIST_LEN]; // AllToAll odd even split comm
155176
int num_nodes;
156177
int derived_node_size;
157178
int is_root_node;
@@ -170,6 +191,7 @@ typedef struct coll_acoll_subcomms {
170191
int initialized;
171192
int prev_init_root;
172193
int num_root_change;
194+
MCA_COLL_ACOLL_R2R_DIST_T r2r_dist;
173195

174196
ompi_communicator_t *numa_comm_ldrs;
175197
ompi_communicator_t *node_comm;

0 commit comments

Comments
 (0)