Commit e582bf1: WIP fix intel
1 parent 5149568 commit e582bf1

9 files changed: +204 -61 lines changed

CMakeLists.txt (+1 -1)

@@ -74,7 +74,7 @@ if((NOT OGL_ALLOW_REFERENCE_ONLY) AND (NOT OGL_USE_EXTERNAL_GINKGO))
 endif()

 set(GINKGO_CHECKOUT_VERSION
-    "sparse-communicator-cmake-modernization-combination-interface-ogl-rebase"
+    "ogl_rebase_1.8.0"
     CACHE STRING "Use specific version of ginkgo")

 include(CheckIncludeFileCXX)

DevicePersistent/Vector/Vector.H (+32 -11)

@@ -22,23 +22,24 @@ namespace Foam {
 template <class T>
 struct VectorInitFunctor {
     using vec = gko::matrix::Dense<scalar>;
-    using dist_vec = gko::experimental::distributed::Vector<scalar>;
-
-    const word name_;

-    const Repartitioner &repartitioner_;
+    using dist_vec = gko::experimental::distributed::Vector<scalar>;

     const ExecutorHandler &exec_;

-    const label verbose_;
+    const word name_;

-    const bool on_device_;
+    const Repartitioner &repartitioner_;

     // Memory from which array will be initialised
     const T *other_;

     const label size_;

+    const label verbose_;
+
+    const bool on_device_;
+
     VectorInitFunctor(const ExecutorHandler &exec, const word name,
                       const Repartitioner &repartitioner, const T *other,
                       const label size, const label verbose,
@@ -82,20 +83,22 @@
         auto host_view =
             gko::array<T>::const_view(exec_.get_ref_exec(), local_size, other_);

+
         // FIXME this shouldn't be needed here also not during an
         // update
         auto comm_pattern = compute_send_recv_counts(
             exec_, repartitioner_.get_ranks_per_gpu(), local_size);

-        // this should happen on device
         auto local_coeffs = gko::array<scalar>(exec, repart_size);

         communicate_values(exec_, comm_pattern, host_view.get_const_data(),
                            local_coeffs.get_data());

         auto ret = gko::share(dist_vec::create(
-            exec, *comm.get(),
+            exec_.get_device_exec(), *comm.get(),
             vec::create(exec, gko::dim<2>{repart_size, 1}, local_coeffs, 1)));
+
+
         return ret;
     }
 };
@@ -175,9 +178,27 @@
         // << " comm_pattern recv_offs " << std::get<3>(comm_pattern)
         // << "\n";

-        communicate_values(exec_, comm_pattern,
-                           get_vector()->get_local_values(),
-                           const_cast<T *>(memory_));
+        std::vector<scalar> host_send_buffer;
+        scalar *send_ptr;
+
+        if (true) {
+            label repart_size = repartitioner_.get_repart_size();
+            host_send_buffer.resize(repart_size);
+            send_ptr = host_send_buffer.data();
+            auto host_exec = exec_.get_ref_exec();
+            auto device_exec = exec_.get_device_exec();
+            auto host_buffer_view =
+                gko::array<scalar>::view(host_exec, repart_size, send_ptr);
+            auto target_buffer_view =
+                gko::array<scalar>::view(device_exec, repart_size,
+                                         get_vector()->get_local_values());
+            target_buffer_view = host_buffer_view;
+        } else {
+            send_ptr = get_vector()->get_local_values();
+        }
+
+        communicate_values(exec_, comm_pattern, send_ptr,
+                           const_cast<T *>(memory_));
     }

     /** Writes the content of the distributed vector to disk
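
The staging branch above leans on Ginkgo's cross-executor array assignment: assigning an array view that lives on one executor to a view on another executor copies the underlying memory between host and device. A minimal sketch of just that idiom (the function and variable names are illustrative, not code from this commit):

    #include <ginkgo/ginkgo.hpp>
    #include <vector>

    // Copy n host values into existing device memory through non-owning views.
    void copy_host_to_device(std::shared_ptr<const gko::Executor> host_exec,
                             std::shared_ptr<const gko::Executor> device_exec,
                             std::vector<double> &host_data, double *device_ptr)
    {
        const auto n = host_data.size();
        // Views wrap memory owned elsewhere; no allocation happens here.
        auto host_view =
            gko::array<double>::view(host_exec, n, host_data.data());
        auto device_view = gko::array<double>::view(device_exec, n, device_ptr);
        // Assigning between equally sized views copies across executors.
        device_view = host_view;
    }

In the hunk above the same mechanism stages data around communicate_values, so an MPI library without device support only ever sees host pointers.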

MatrixWrapper/Combination/Combination.H (-3)

@@ -33,9 +33,6 @@ public:

     void read(const device_matrix_data &data) override
     {
-        // TODO FIXME if no matrix is given because of repartitioning
-        // which might remove non_local matrix
-        // a zero sized dummy matrix needs to be created
         auto exec = this->get_executor();
         auto data_exec = data.get_executor();

MatrixWrapper/CommunicationPattern/CommunicationPattern.C (+10 -1)

@@ -103,6 +103,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,

     label tot_recv_elements{0};
     label comm_elements_buffer{0};
+    std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
     if (rank == owner_rank) {
         // send and recv to itself
         recv_offsets[owner_rank] = padding_before;
@@ -111,6 +112,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,
         // the start of the next rank data
         tot_recv_elements = padding_before + size + padding_after;

+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
         for (int i = 1; i < ranks_per_gpu; i++) {
             // receive the recv counts
             comm.recv(exec, &comm_elements_buffer, 1, rank + i, rank);
@@ -165,7 +167,14 @@ void communicate_values (
     //
     // send_buffer should be on the host
     // recv_buffer should be on the device
-    // auto rank = comm.rank();
+    // auto rank = comm.rank();
+    // std::cout
+    //     << __FILE__ << ":" << __LINE__
+    //     << " send_counts " << send_counts
+    //     << " recv_counts " << recv_counts
+    //     << " send_offsets " << send_offsets
+    //     << " recv_offsets " << recv_offsets
+    //     << "\n";

     comm.all_to_all_v(exec, send_buffer, send_counts.data(),
                       send_offsets.data(), recv_buffer, recv_counts.data(),
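
comm.all_to_all_v follows MPI_Alltoallv semantics: each rank passes per-rank element counts plus offsets into one contiguous buffer, where the offsets are exclusive prefix sums of the counts. A self-contained sketch against plain MPI (the helper name and the double payload are assumptions for illustration):

    #include <mpi.h>
    #include <numeric>
    #include <vector>

    // Exchange variable-sized blocks; offsets are derived from the counts.
    // Assumes a communicator with at least one rank.
    void exchange_values(const std::vector<double> &send,
                         const std::vector<int> &send_counts,
                         std::vector<double> &recv,
                         const std::vector<int> &recv_counts, MPI_Comm comm)
    {
        int size;
        MPI_Comm_size(comm, &size);
        // Exclusive prefix sums: offs[0] = 0, offs[i] = sum(counts[0..i-1]).
        std::vector<int> send_offsets(size, 0), recv_offsets(size, 0);
        std::partial_sum(send_counts.begin(), send_counts.end() - 1,
                         send_offsets.begin() + 1);
        std::partial_sum(recv_counts.begin(), recv_counts.end() - 1,
                         recv_offsets.begin() + 1);
        recv.resize(recv_offsets[size - 1] + recv_counts[size - 1]);
        MPI_Alltoallv(send.data(), send_counts.data(), send_offsets.data(),
                      MPI_DOUBLE, recv.data(), recv_counts.data(),
                      recv_offsets.data(), MPI_DOUBLE, comm);
    }

The padding_before/padding_after arguments of compute_send_recv_counts shift these offsets so that received blocks land at the right position inside a larger buffer (for example, after the diagonal entries).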

MatrixWrapper/Distributed/Distributed.H (+113 -39)

@@ -146,7 +146,7 @@ public:
         // "for performance reasons"
         // << abort(FatalError);
         gko::experimental::EnableDistributedLinOp<
-            RepartDistMatrix>::operator=(std::move(other));
+            RepartDistMatrix>::operator=(other);
         this->dist_mtx_ = other.dist_mtx_;
         this->local_sparsity_ = other.local_sparsity_;
         this->non_local_sparsity_ = other.non_local_sparsity_;
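
The switch from operator=(std::move(other)) to operator=(other) makes the base-class assignment consistent with the member-wise copies that follow: the subsequent lines still read from other, so the base subobject must not treat it as expendable. Note also that std::move on a const reference silently selects the copy assignment anyway. A small illustration of both points (the types are invented for the example):

    struct Base {
        int payload = 0; // implicit copy assignment is all a const& can bind to
    };

    struct Derived : Base {
        int extra = 0;
        Derived &operator=(const Derived &other)
        {
            // Base::operator=(std::move(other)) would compile, but because
            // `other` is const it resolves to the copy assignment anyway;
            // writing the copy explicitly states what actually happens.
            Base::operator=(other);
            extra = other.extra; // safe: `other` was never moved from
            return *this;
        }
    };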
@@ -215,10 +215,30 @@ public:
         local_sparsity_ = repart_loc_sparsity;
         non_local_sparsity_ = repart_non_loc_sparsity;

+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " build_localized_partition \n";
+        //<< " dim " << local_sparsity_->dim[0] << " send idxs size "
+        //<< dst_comm_pattern.send_idxs.size() << " target ids "
+        //<< dst_comm_pattern.target_ids << " target sizes "
+        //<< dst_comm_pattern.target_sizes << "\n";
+
         auto localized_partition = local_part_type::build_from_blocked_recv(
             exec, local_sparsity_->dim[0], dst_comm_pattern->send_idxs,
             dst_comm_pattern->target_ids, dst_comm_pattern->target_sizes);

+        std::cout << __FILE__ << " rank " << rank << " local sparsity size "
+                  << local_sparsity_->size_ << " local sparsity dim ["
+                  << local_sparsity_->dim[0] << "x" << local_sparsity_->dim[1]
+                  << "] non_local sparsity size " << non_local_sparsity_->size_
+                  << " non local sparsity dim [" << non_local_sparsity_->dim[0]
+                  << "x" << non_local_sparsity_->dim[1] << "] target_ids "
+                  << dst_comm_pattern->target_ids << " target_sizes "
+                  << dst_comm_pattern->target_sizes << " target_send_idxs.size "
+                  << dst_comm_pattern->send_idxs.size()
+                  << " non_local_sparsity.size " << non_local_sparsity_->size_
+                  << " get_recv_indices "
+                  << localized_partition->get_recv_indices().get_num_elems()
+                  << " \n";

         auto sparse_comm =
             sparse_communicator::create(comm, localized_partition);
@@ -264,11 +284,11 @@ public:
                 non_local_sparsity_->row_idxs,
                 non_local_sparsity_->col_idxs, non_local_coeffs),
             sparse_comm);
-
-
         update_impl(exec_handler, matrix_format, repartitioner, host_A, dist_A,
                     local_sparsity_, non_local_sparsity_, src_comm_pattern,
                     local_interfaces);
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done update impl \n";

         auto ret = std::make_shared<RepartDistMatrix>(
             exec, comm, repartitioner.get_repart_dim(), dist_A->get_size(),
@@ -305,6 +325,8 @@ public:
         auto exec = exec_handler.get_ref_exec();
         auto device_exec = exec_handler.get_device_exec();
         auto ranks_per_gpu = repartitioner.get_ranks_per_gpu();
+        bool requires_host_buffer = exec_handler.get_gko_force_host_buffer();
+
         label rank{repartitioner.get_rank(exec_handler)};
         label owner_rank = repartitioner.get_owner_rank(exec_handler);
         bool owner = repartitioner.is_owner(exec_handler);
@@ -314,8 +336,6 @@ public:
         auto diag_comm_pattern = compute_send_recv_counts(
             exec_handler, ranks_per_gpu, nrows, local_matrix_nnz,
             local_matrix_nnz - nrows, 0);
-
-
         label upper_nnz = host_A->get_upper_nnz();
         auto upper_comm_pattern = compute_send_recv_counts(
             exec_handler, ranks_per_gpu, upper_nnz, local_matrix_nnz, 0,
@@ -325,18 +345,29 @@ public:
             local_matrix_nnz, upper_nnz, nrows);

         scalar *local_ptr;
+        scalar *local_ptr_2;
+        label nnz = 0;

         // update main values
+        std::vector<scalar> loc_buffer;
         if (owner) {
             using Coo = gko::matrix::Coo<scalar, label>;
             auto local_mtx = dist_A->get_local_matrix();

+
             std::shared_ptr<const Coo> local =
                 gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
                     dist_A->get_local_matrix())
                                  ->get_combination()
                                  ->get_operators()[0]);
-            local_ptr = const_cast<scalar *>(local->get_const_values());
+            nnz = local->get_num_stored_elements();
+            if (requires_host_buffer) {
+                loc_buffer.resize(nnz);
+                local_ptr = loc_buffer.data();
+                local_ptr_2 = const_cast<scalar *>(local->get_const_values());
+            } else {
+                local_ptr = const_cast<scalar *>(local->get_const_values());
+            }
         }
         communicate_values(exec_handler, diag_comm_pattern, host_A->get_diag(),
                            local_ptr);
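
The pattern introduced here keeps two pointers: local_ptr is where communicate_values writes (a plain host std::vector when requires_host_buffer is set), while local_ptr_2 remembers the device values so the staged data can be copied over afterwards. A condensed sketch of that control flow (force_host_buffer and the buffer names are illustrative stand-ins, not this file's API):

    #include <ginkgo/ginkgo.hpp>
    #include <vector>

    void receive_into_device(bool force_host_buffer,
                             std::shared_ptr<const gko::Executor> host_exec,
                             std::shared_ptr<const gko::Executor> device_exec,
                             double *device_values, std::size_t nnz)
    {
        std::vector<double> host_buffer;
        double *recv_ptr;
        if (force_host_buffer) {
            host_buffer.resize(nnz);
            recv_ptr = host_buffer.data(); // MPI writes to the host first
        } else {
            recv_ptr = device_values;      // device-aware MPI writes directly
        }

        // ... communication into recv_ptr happens here ...

        if (force_host_buffer) {
            // One bulk host -> device copy, via the same view-assignment
            // idiom as in the Vector.H sketch above.
            auto host_view =
                gko::array<double>::view(host_exec, nnz, recv_ptr);
            auto device_view =
                gko::array<double>::view(device_exec, nnz, device_values);
            device_view = host_view;
        }
    }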
@@ -352,7 +383,26 @@ public:
             communicate_values(exec_handler, lower_comm_pattern,
                                host_A->get_lower(), local_ptr);
         }
-
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done comm local mtx \n";
+
+        if (requires_host_buffer) {
+            auto host_buffer_view =
+                gko::array<scalar>::view(exec, nnz, local_ptr);
+            auto target_buffer_view =
+                gko::array<scalar>::view(device_exec, nnz, local_ptr_2);
+            target_buffer_view = host_buffer_view;
+        }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done copy to device \n";
+
+        if (requires_host_buffer) {
+            auto host_buffer_view =
+                gko::array<scalar>::view(exec, nnz, local_ptr);
+            auto target_buffer_view =
+                gko::array<scalar>::view(device_exec, nnz, local_ptr_2);
+            target_buffer_view = host_buffer_view;
+        }
         // copy interface values
         auto comm = *exec_handler.get_communicator().get();
         if (owner) {
@@ -364,6 +414,8 @@ public:
             label tag = 0;
             label comm_rank, comm_size;
             scalar *recv_buffer_ptr;
+            scalar *recv_buffer_ptr_2;
+            std::vector<scalar> host_recv_buffer;
             label remain_host_interfaces = host_A->get_interface_size();
             for (auto [is_local, comm_rank] : local_interfaces) {
                 label &ctr = (is_local) ? loc_ctr : nloc_ctr;
@@ -383,19 +435,35 @@
                     comm_size =
                         non_local_sparsity->interface_spans[ctr].length();
                 }
-                recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
+
+                if (requires_host_buffer) {
+                    host_recv_buffer.resize(comm_size);
+                    recv_buffer_ptr = host_recv_buffer.data();
+                    recv_buffer_ptr_2 = const_cast<scalar *>(mtx->get_const_values());
+                } else {
+                    recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
+                }

                 if (comm_rank != rank) {
-                    comm.recv(exec, recv_buffer_ptr, comm_size, comm_rank, tag);
+                    comm.recv(device_exec, recv_buffer_ptr, comm_size, comm_rank, tag);
+                    if (requires_host_buffer) {
+                        auto host_buffer_view =
+                            gko::array<scalar>::view(exec, comm_size, recv_buffer_ptr);
+                        auto target_buffer_view =
+                            gko::array<scalar>::view(device_exec, comm_size, recv_buffer_ptr_2);
+                        target_buffer_view = host_buffer_view;
+                    }
+
                 } else {
                     // if data is already on this rank
                     auto data_view = gko::array<scalar>::const_view(
                         exec, comm_size,
                         host_A->get_interface_data(host_interface_ctr));

                     // TODO FIXME this needs target executor
+                    recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
                     auto target_view = gko::array<scalar>::view(
-                        exec, comm_size, recv_buffer_ptr);
+                        device_exec, comm_size, recv_buffer_ptr);

                     target_view = data_view;

@@ -409,7 +477,7 @@ public:
                 auto neg_one = gko::initialize<vec>({-1.0}, exec);
                 auto interface_dense = vec::create(
                     exec, gko::dim<2>{comm_size, 1},
-                    gko::array<scalar>::view(exec, comm_size, recv_buffer_ptr),
+                    gko::array<scalar>::view(device_exec, comm_size, recv_buffer_ptr),
                     1);

                 interface_dense->scale(neg_one);
@@ -423,37 +491,43 @@
                 label comm_size =
                     src_comm_pattern->target_sizes.get_const_data()[i];
                 const scalar *send_buffer_ptr = host_A->get_interface_data(i);
-                comm.send(exec, send_buffer_ptr, comm_size, owner_rank, tag);
+                comm.send(device_exec, send_buffer_ptr, comm_size, owner_rank, tag);
             }
         }
-
         // reorder updated values
-        if (owner) {
-            // NOTE local sparsity size includes the interfaces
-            using Coo = gko::matrix::Coo<scalar, label>;
-            using dim_type = gko::dim<2>::dimension_type;
-            std::shared_ptr<const Coo> local =
-                gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
-                    dist_A->get_local_matrix())
-                                 ->get_combination()
-                                 ->get_operators()[0]);
-            auto local_elements = local->get_num_stored_elements();
-            local_ptr = const_cast<scalar *>(local->get_const_values());
-            // TODO make sure this doesn't copy
-            // create a non owning dense matrix of local_values
-
-            auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
-                exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
-                gko::array<scalar>::view(exec, local_elements, local_ptr), 1));
-
-            auto mapping_view = gko::array<label>::view(
-                exec, local_elements, local_sparsity->ldu_mapping.get_data());
-
-
-            // TODO this needs to copy ldu_mapping to the device
-            auto dense_vec = row_collection->clone();
-            dense_vec->row_gather(&mapping_view, row_collection.get());
-        }
+        if (owner) {
+            // NOTE local sparsity size includes the interfaces
+            using Coo = gko::matrix::Coo<scalar, label>;
+            using dim_type = gko::dim<2>::dimension_type;
+            std::shared_ptr<const Coo> local =
+                gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
+                    dist_A->get_local_matrix())
+                                 ->get_combination()
+                                 ->get_operators()[0]);
+            auto local_elements = local->get_num_stored_elements();
+            local_ptr = const_cast<scalar *>(local->get_const_values());
+            // TODO make sure this doesn't copy
+            // create a non owning dense matrix of local_values
+
+            auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
+                device_exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
+                gko::array<scalar>::view(device_exec, local_elements, local_ptr), 1));
+            auto mapping_view = gko::array<label>::view(
+                exec, local_elements, local_sparsity->ldu_mapping.get_data());
+
+
+            // TODO this needs to copy ldu_mapping to the device
+            auto dense_vec = row_collection->clone();
+            //auto dense_vec = gko::share(gko::matrix::Dense<scalar>::create(exec, row_collection->get_size()));
+
+            std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                      << " reorder \n";
+            dense_vec->row_gather(&mapping_view, row_collection.get());
+            std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                      << " reorder \n";
+        }
+        std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
+                  << " done reorder \n";
     };

     RepartDistMatrix(
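
The row_gather call is what reorders the received LDU-ordered values into the COO value layout: an N x 1 Dense view of the value array is permuted through ldu_mapping. A reference-executor sketch of the idiom (the data and mapping are made up):

    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        auto exec = gko::ReferenceExecutor::create();
        double values[] = {10.0, 20.0, 30.0};
        gko::int32 mapping[] = {2, 0, 1};

        // Non-owning 3x1 dense view of the value array.
        auto view = gko::share(gko::matrix::Dense<double>::create(
            exec, gko::dim<2>{3, 1},
            gko::array<double>::view(exec, 3, values), 1));
        auto mapping_view = gko::array<gko::int32>::view(exec, 3, mapping);

        // Gather rows of a copy back into the original storage:
        // values[i] = copy[mapping[i]], so values becomes {30, 10, 20}.
        auto copy = view->clone();
        copy->row_gather(&mapping_view, view.get());
    }

As the commit's TODO notes, the mapping view must live on the same executor as the gathered data for this to work on the device.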
