Commit 05a5d8c

WIP fix intel
1 parent 5149568 commit 05a5d8c

5 files changed: +157 -31 lines

MatrixWrapper/CommunicationPattern/CommunicationPattern.C (+9)

@@ -103,6 +103,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,
 
     label tot_recv_elements{0};
     label comm_elements_buffer{0};
+    std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
     if (rank == owner_rank) {
         // send and recv to itself
         recv_offsets[owner_rank] = padding_before;
@@ -111,6 +112,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,
         // the start of the next rank data
         tot_recv_elements = padding_before + size + padding_after;
 
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
         for (int i = 1; i < ranks_per_gpu; i++) {
             // receive the recv counts
             comm.recv(exec, &comm_elements_buffer, 1, rank + i, rank);
@@ -166,6 +168,13 @@ void communicate_values (
     // send_buffer should be on the host
     // recv_buffer should be on the device
     // auto rank = comm.rank();
+    std::cout
+        << __FILE__ << ":" << __LINE__
+        << " send_counts " << send_counts
+        << " recv_counts " << recv_counts
+        << " send_offsets " << send_offsets
+        << " recv_offsets " << recv_offsets
+        << "\n";
 
     comm.all_to_all_v(exec, send_buffer, send_counts.data(),
                       send_offsets.data(), recv_buffer, recv_counts.data(),
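
Note on the counts/offsets pairs printed above and consumed by comm.all_to_all_v: each rank passes one element count per peer plus matching buffer offsets, and the offsets are the exclusive prefix sums of the counts. A minimal standalone sketch of that offset computation (illustrative only, not code from this repo):

    #include <numeric>
    #include <vector>

    // offsets[i] = counts[0] + ... + counts[i-1], with offsets[0] = 0,
    // i.e. the exclusive prefix sum that all_to_all_v-style calls expect.
    std::vector<int> exclusive_prefix_sum(const std::vector<int> &counts)
    {
        std::vector<int> offsets(counts.size(), 0);
        if (counts.size() > 1) {
            std::partial_sum(counts.begin(), counts.end() - 1,
                             offsets.begin() + 1);
        }
        return offsets;
    }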

MatrixWrapper/Distributed/Distributed.H (+104 -28)

@@ -215,10 +215,30 @@ public:
         local_sparsity_ = repart_loc_sparsity;
         non_local_sparsity_ = repart_non_loc_sparsity;
 
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " build_localized_partition \n";
+        //<< " dim " << local_sparsity_->dim[0] << " send idxs size "
+        //<< dst_comm_pattern.send_idxs.size() << " target ids "
+        //<< dst_comm_pattern.target_ids << " target sizes "
+        //<< dst_comm_pattern.target_sizes << "\n";
+
         auto localized_partition = local_part_type::build_from_blocked_recv(
             exec, local_sparsity_->dim[0], dst_comm_pattern->send_idxs,
             dst_comm_pattern->target_ids, dst_comm_pattern->target_sizes);
 
+        std::cout << __FILE__ << " rank " << rank << " local sparsity size "
+                  << local_sparsity_->size_ << " local sparsity dim ["
+                  << local_sparsity_->dim[0] << "x" << local_sparsity_->dim[1]
+                  << "] non_local sparsity size " << non_local_sparsity_->size_
+                  << " non local sparsity dim [" << non_local_sparsity_->dim[0]
+                  << "x" << non_local_sparsity_->dim[1] << "] target_ids "
+                  << dst_comm_pattern->target_ids << " target_sizes "
+                  << dst_comm_pattern->target_sizes << " target_send_idxs.size "
+                  << dst_comm_pattern->send_idxs.size()
+                  << " non_local_sparsity.size " << non_local_sparsity_->size_
+                  << " get_recv_indices "
+                  << localized_partition->get_recv_indices().get_num_elems()
+                  << " \n";
 
         auto sparse_comm =
             sparse_communicator::create(comm, localized_partition);
@@ -264,11 +284,15 @@ public:
                 non_local_sparsity_->row_idxs,
                 non_local_sparsity_->col_idxs, non_local_coeffs),
             sparse_comm);
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done read distributed \n";
 
 
         update_impl(exec_handler, matrix_format, repartitioner, host_A, dist_A,
                     local_sparsity_, non_local_sparsity_, src_comm_pattern,
                     local_interfaces);
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done update impl \n";
 
         auto ret = std::make_shared<RepartDistMatrix>(
             exec, comm, repartitioner.get_repart_dim(), dist_A->get_size(),
@@ -305,6 +329,8 @@ public:
         auto exec = exec_handler.get_ref_exec();
         auto device_exec = exec_handler.get_device_exec();
         auto ranks_per_gpu = repartitioner.get_ranks_per_gpu();
+        bool requires_host_buffer = exec_handler.get_gko_force_host_buffer();
+
         label rank{repartitioner.get_rank(exec_handler)};
         label owner_rank = repartitioner.get_owner_rank(exec_handler);
         bool owner = repartitioner.is_owner(exec_handler);
@@ -314,29 +340,43 @@ public:
         auto diag_comm_pattern = compute_send_recv_counts(
             exec_handler, ranks_per_gpu, nrows, local_matrix_nnz,
             local_matrix_nnz - nrows, 0);
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " diag comm pattern \n";
 
 
         label upper_nnz = host_A->get_upper_nnz();
         auto upper_comm_pattern = compute_send_recv_counts(
             exec_handler, ranks_per_gpu, upper_nnz, local_matrix_nnz, 0,
             local_matrix_nnz - upper_nnz);
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " upper comm pattern \n";
         auto lower_comm_pattern =
             compute_send_recv_counts(exec_handler, ranks_per_gpu, upper_nnz,
                                      local_matrix_nnz, upper_nnz, nrows);
 
         scalar *local_ptr;
+        scalar *local_ptr_2;
+        label nnz = 0;
 
         // update main values
+        std::vector<scalar> loc_buffer;
         if (owner) {
             using Coo = gko::matrix::Coo<scalar, label>;
             auto local_mtx = dist_A->get_local_matrix();
 
+
             std::shared_ptr<const Coo> local =
                 gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
                                  dist_A->get_local_matrix())
                                  ->get_combination()
                                  ->get_operators()[0]);
-            local_ptr = const_cast<scalar *>(local->get_const_values());
+            if (requires_host_buffer) {
+                loc_buffer.resize(local->get_num_stored_elements());
+                local_ptr = loc_buffer.data();
+                local_ptr_2 = const_cast<scalar *>(local->get_const_values());
+            } else {
+                local_ptr = const_cast<scalar *>(local->get_const_values());
+            }
         }
         communicate_values(exec_handler, diag_comm_pattern, host_A->get_diag(),
                            local_ptr);
@@ -352,6 +392,18 @@ public:
             communicate_values(exec_handler, lower_comm_pattern,
                                host_A->get_lower(), local_ptr);
         }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done comm local mtx \n";
+
+        if (requires_host_buffer) {
+            auto host_buffer_view =
+                gko::array<scalar>::view(exec, nnz, local_ptr);
+            auto target_buffer_view =
+                gko::array<scalar>::view(device_exec, nnz, local_ptr_2);
+            target_buffer_view = host_buffer_view;
+        }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done copy to device \n";
 
         // copy interface values
         auto comm = *exec_handler.get_communicator().get();
@@ -364,6 +416,7 @@ public:
         label tag = 0;
         label comm_rank, comm_size;
         scalar *recv_buffer_ptr;
+        std::vector<scalar> host_recv_buffer;
         label remain_host_interfaces = host_A->get_interface_size();
         for (auto [is_local, comm_rank] : local_interfaces) {
             label &ctr = (is_local) ? loc_ctr : nloc_ctr;
@@ -383,9 +436,18 @@ public:
                 comm_size =
                     non_local_sparsity->interface_spans[ctr].length();
             }
-            recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
+
+            if (requires_host_buffer) {
+                host_recv_buffer.resize(comm_size);
+                recv_buffer_ptr = host_recv_buffer.data();
+            } else {
+                recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
+            }
 
             if (comm_rank != rank) {
+                std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                          << " comm_rank " << comm_rank << " rank " << rank << " \n";
+
                 comm.recv(exec, recv_buffer_ptr, comm_size, comm_rank, tag);
             } else {
                 // if data is already on this rank
@@ -427,33 +489,47 @@ public:
             }
         }
 
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " reorder \n";
         // reorder updated values
-        if (owner) {
-            // NOTE local sparsity size includes the interfaces
-            using Coo = gko::matrix::Coo<scalar, label>;
-            using dim_type = gko::dim<2>::dimension_type;
-            std::shared_ptr<const Coo> local =
-                gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
-                                 dist_A->get_local_matrix())
-                                 ->get_combination()
-                                 ->get_operators()[0]);
-            auto local_elements = local->get_num_stored_elements();
-            local_ptr = const_cast<scalar *>(local->get_const_values());
-            // TODO make sure this doesn't copy
-            // create a non owning dense matrix of local_values
-
-            auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
-                exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
-                gko::array<scalar>::view(exec, local_elements, local_ptr), 1));
-
-            auto mapping_view = gko::array<label>::view(
-                exec, local_elements, local_sparsity->ldu_mapping.get_data());
-
-
-            // TODO this needs to copy ldu_mapping to the device
-            auto dense_vec = row_collection->clone();
-            dense_vec->row_gather(&mapping_view, row_collection.get());
-        }
+        if (owner) {
+            // NOTE local sparsity size includes the interfaces
+            using Coo = gko::matrix::Coo<scalar, label>;
+            using dim_type = gko::dim<2>::dimension_type;
+            std::shared_ptr<const Coo> local =
+                gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
+                                 dist_A->get_local_matrix())
+                                 ->get_combination()
+                                 ->get_operators()[0]);
+            auto local_elements = local->get_num_stored_elements();
+            local_ptr = const_cast<scalar *>(local->get_const_values());
+            // TODO make sure this doesn't copy
+            // create a non owning dense matrix of local_values
+
+            auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
+                device_exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
+                gko::array<scalar>::view(device_exec, local_elements, local_ptr), 1));
+            std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                      << " local_elements " << local_elements << " reorder \n";
+
+            auto mapping_view = gko::array<label>::view(
+                exec, local_elements, local_sparsity->ldu_mapping.get_data());
+            std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                      << " reorder \n";
+
+
+            // TODO this needs to copy ldu_mapping to the device
+            auto dense_vec = row_collection->clone();
+            //auto dense_vec = gko::share(gko::matrix::Dense<scalar>::create(exec, row_collection->get_size()));
+
+            std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                      << " reorder \n";
+            dense_vec->row_gather(&mapping_view, row_collection.get());
+            std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                      << " reorder \n";
+        }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done reorder \n";
     };
 
     RepartDistMatrix(
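
The requires_host_buffer path above fills a host-side std::vector via communicate_values and then pushes the result to device memory by assigning one non-owning gko::array view to another. A condensed sketch of just that staging idiom, using the same Ginkgo calls the diff already relies on (function and variable names here are illustrative):

    #include <ginkgo/ginkgo.hpp>
    #include <vector>

    // Copy n values from a host staging buffer into existing device memory.
    // Assigning between equally sized array views performs the cross-executor
    // copy; neither view owns its memory.
    void stage_to_device(std::shared_ptr<const gko::Executor> host_exec,
                         std::shared_ptr<const gko::Executor> device_exec,
                         std::vector<double> &host_buffer, double *device_ptr)
    {
        const auto n = host_buffer.size();
        auto host_view =
            gko::array<double>::view(host_exec, n, host_buffer.data());
        auto device_view =
            gko::array<double>::view(device_exec, n, device_ptr);
        device_view = host_view;
    }

One caveat visible in the hunk itself: the views there are created with length nnz, which is still 0 as declared (label nnz = 0;) unless it is updated before the copy.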

MatrixWrapper/SparsityPattern/SparsityPattern.H (+10)

@@ -47,6 +47,16 @@ struct SparsityPattern {
           rank(std::vector<label>{})
     {}
 
+    SparsityPattern(std::shared_ptr<const SparsityPattern> other)
+        : size_(other->size_),
+          row_idxs(other->row_idxs),
+          col_idxs(other->col_idxs),
+          ldu_mapping(other->ldu_mapping),
+          dim(other->dim),
+          interface_spans(other->interface_spans),
+          rank(other->rank)
+    {}
+
     SparsityPattern(std::shared_ptr<const gko::Executor> exec, label size)
         : size_(size),
           row_idxs{exec, static_cast<gko::size_type>(size_)},
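
The new converting constructor lets a pattern held through a shared_ptr-to-const be deep-copied in a single call, which is what the ranks_per_gpu == 1 fast path in Repartitioner.H below relies on. A hypothetical usage sketch (copy_pattern is illustrative, not part of the repo):

    std::shared_ptr<SparsityPattern> copy_pattern(
        std::shared_ptr<const SparsityPattern> src)
    {
        // Invokes the new constructor; every member (row_idxs, col_idxs,
        // ldu_mapping, dim, interface_spans, rank) is copied from src.
        return std::make_shared<SparsityPattern>(src);
    }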

Preconditioner/Preconditioner.H (+1 -1)

@@ -262,7 +262,7 @@ public:
         auto smoother_gen = gko::share(
             ir::build()
                 .with_solver(inner_solver_gen)
-                .with_relaxation_factor(0.9)
+                //.with_relaxation_factor(0.9)
                 .with_criteria(
                     gko::stop::Iteration::build().with_max_iters(2u).on(
                         device_exec))
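
Commenting out with_relaxation_factor leaves the IR smoother at Ginkgo's default relaxation factor (1.0, if I read the Ir factory defaults correctly), i.e. undamped Richardson iteration, so this disables the damping rather than the smoother itself. Should the factor come back but as a tunable, a sketch along these lines is one option (the lookupOrDefault-style config access is an assumption, not shown in this diff):

    // Hypothetical: read the damping factor from a config with a default of
    // 1.0 instead of hard-coding 0.9.
    const scalar omega =
        config.lookupOrDefault<scalar>("relaxationFactor", 1.0);
    auto smoother_gen = gko::share(
        ir::build()
            .with_solver(inner_solver_gen)
            .with_relaxation_factor(omega)
            .with_criteria(
                gko::stop::Iteration::build().with_max_iters(2u).on(
                    device_exec))
            .on(device_exec));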

Repartitioner/Repartitioner.H (+33 -2)

@@ -214,7 +214,8 @@ public:
     ** signals whether this is a new local interface (no communication), and the
     ** second entry (label) tracks the original rank of the interface
     */
-    std::tuple<std::shared_ptr<SparsityPattern>,
+    std::tuple<
+        std::shared_ptr<SparsityPattern>,
         std::shared_ptr<SparsityPattern>,
         std::vector<std::pair<bool, label>>>
     repartition_sparsity(
@@ -238,19 +239,44 @@ public:
         label rank = get_rank(exec_handler);
         label owner_rank = get_owner_rank(exec_handler);
         label ranks_per_gpu = ranks_per_gpu_;
+        // TODO don't copy
+        if (ranks_per_gpu == 1) {
+            std::vector<std::pair<bool, label>> ret;
+            for (auto comm_rank : src_non_local_pattern->rank) {
+                ret.emplace_back(false, rank);
+            }
+
+
+            return std::make_tuple<
+                std::shared_ptr<SparsityPattern>,
+                std::shared_ptr<SparsityPattern>,
+                std::vector<std::pair<bool, label>>>(
+                std::make_shared<SparsityPattern>(src_local_pattern),
+                std::make_shared<SparsityPattern>(src_non_local_pattern),
+                std::move(ret)
+            );
+        }
 
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
         auto local_comm_pattern = compute_send_recv_counts(
             exec_handler, ranks_per_gpu, src_local_pattern->size_);
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " owner rank " << owner_rank << "\n";
+
 
-        label offset = orig_partition_->get_range_bounds()[rank] -
+        label offset = 0;
+        if (ranks_per_gpu != 1) {
+            offset = orig_partition_->get_range_bounds()[rank] -
                 orig_partition_->get_range_bounds()[owner_rank];
+        }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
 
         auto gather_closure = [&](auto &comm_pattern, auto &data,
                                   label offset) {
             return gather_to_owner(exec_handler, comm_pattern, data.get_size(),
                                    data.get_data(), offset);
         };
 
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
         SparsityPatternVector merged_local{
             gather_closure(local_comm_pattern, src_local_pattern->row_idxs,
                            offset),
@@ -267,6 +293,7 @@ public:
             make_ldu_mapping_consecutive(
                 local_comm_pattern, merged_local.mapping, rank, ranks_per_gpu);
         }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
 
         label rows =
             (is_owner(exec_handler)) ? merged_local.rows.back() + 1 : 0;
@@ -284,6 +311,7 @@ public:
             spans_begin.push_back(elem.begin);
             spans_end.push_back(elem.end);
         }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
 
         // the non local cols are in local idx of other side
         // thus we need the new offset of the other side
@@ -300,6 +328,7 @@ public:
             std::transform(data, data + size, data,
                            [&](label idx) { return idx + local_offset; });
         }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
 
         SparsityPatternVector merged_non_local{
             gather_closure(non_local_comm_pattern,
@@ -328,6 +357,7 @@ public:
         // build vector with locality information
         std::vector<std::pair<bool, label>> locality;
         label ctr{0};
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
 
         if (is_owner(exec_handler)) {
             auto recv_counts = std::get<1>(span_comm_pattern);
@@ -354,6 +384,7 @@ public:
             gathered_non_local.cols[i] = i;
         }
 
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
         LOG_1(verbose_, "done repartition sparsity pattern")
         if (is_owner(exec_handler)) {
             auto new_local_spars_pattern = std::make_shared<SparsityPattern>(
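
For reference, the three-element tuple returned by repartition_sparsity (including from the new ranks_per_gpu == 1 early exit) unpacks cleanly with structured bindings. An illustrative call site; the exact parameter list is not shown in this diff, so the arguments here are assumptions:

    auto [repart_local, repart_non_local, locality] =
        repartitioner.repartition_sparsity(exec_handler, src_local_pattern,
                                           src_non_local_pattern);
    // locality[i].first  : true if interface i is a new local interface
    //                      (no communication needed)
    // locality[i].second : original rank of interface i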
