Skip to content

Commit bbd0ffa

Browse files
committed
Move host-side allocation to benchmarks and reuse device with UVM
This commit puts benchmarks in control of allocating the host memory used for verifying the results. This enables benchmarks that use Unified Memory for device allocations to avoid the host-side allocation and simply pass pointers to the device allocation to the benchmark driver. Closes UoB-HPC#128.
1 parent 321ba62 commit bbd0ffa

40 files changed

+546
-496
lines changed

Diff for: CMakeLists.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
4444
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
4545
endif ()
4646

47+
option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)
48+
4749
# setup some defaults flags for everything
4850
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
49-
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
51+
set(DEFAULT_RELEASE_FLAGS -O3)
52+
if (BUILD_NATIVE)
53+
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
54+
endif()
5055

5156
macro(hint_flag FLAG DESCRIPTION)
5257
if (NOT DEFINED ${FLAG})

Diff for: src/Stream.h

+4-9
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77

88
#pragma once
99

10+
#include <array>
1011
#include <vector>
1112
#include <string>
12-
13-
// Array values
14-
#define startA (0.1)
15-
#define startB (0.2)
16-
#define startC (0.0)
17-
#define startScalar (0.4)
13+
#include "benchmark.h"
1814

1915
template <class T>
2016
class Stream
@@ -31,9 +27,8 @@ class Stream
3127
virtual void nstream() = 0;
3228
virtual T dot() = 0;
3329

34-
// Copy memory between host and device
35-
virtual void init_arrays(T initA, T initB, T initC) = 0;
36-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
30+
// Set pointers to read from arrays
31+
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
3732
};
3833

3934
// Implementation specific device functions

Diff for: src/StreamModels.h

+17-17
Original file line numberDiff line numberDiff line change
@@ -35,67 +35,67 @@
3535
#include "FutharkStream.h"
3636
#endif
3737

38-
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
38+
template <typename T, typename...Args>
39+
std::unique_ptr<Stream<T>> make_stream(Args... args) {
4040
#if defined(CUDA)
4141
// Use the CUDA implementation
42-
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
42+
return std::make_unique<CUDAStream<T>>(args...);
4343

4444
#elif defined(HIP)
4545
// Use the HIP implementation
46-
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
46+
return std::make_unique<HIPStream<T>>(args...);
4747

4848
#elif defined(HC)
4949
// Use the HC implementation
50-
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
50+
return std::make_unique<HCStream<T>>(args...);
5151

5252
#elif defined(OCL)
5353
// Use the OpenCL implementation
54-
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
54+
return std::make_unique<OCLStream<T>>(args...);
5555

5656
#elif defined(USE_RAJA)
5757
// Use the RAJA implementation
58-
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
58+
return std::make_unique<RAJAStream<T>>(args...);
5959

6060
#elif defined(KOKKOS)
6161
// Use the Kokkos implementation
62-
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
62+
return std::make_unique<KokkosStream<T>>(args...);
6363

6464
#elif defined(STD_DATA)
6565
// Use the C++ STD data-oriented implementation
66-
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
66+
return std::make_unique<STDDataStream<T>>(args...);
6767

6868
#elif defined(STD_INDICES)
6969
// Use the C++ STD index-oriented implementation
70-
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
70+
return std::make_unique<STDIndicesStream<T>>(args...);
7171

7272
#elif defined(STD_RANGES)
7373
// Use the C++ STD ranges implementation
74-
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
74+
return std::make_unique<STDRangesStream<T>>(args...);
7575

7676
#elif defined(TBB)
7777
// Use the C++20 implementation
78-
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
78+
return std::make_unique<TBBStream<T>>(args...);
7979

8080
#elif defined(THRUST)
8181
// Use the Thrust implementation
82-
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
82+
return std::make_unique<ThrustStream<T>>(args...);
8383

8484
#elif defined(ACC)
8585
// Use the OpenACC implementation
86-
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
86+
return std::make_unique<ACCStream<T>>(args...);
8787

8888
#elif defined(SYCL) || defined(SYCL2020)
8989
// Use the SYCL implementation
90-
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
90+
return std::make_unique<SYCLStream<T>>(args...);
9191

9292
#elif defined(OMP)
9393
// Use the OpenMP implementation
94-
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
94+
return std::make_unique<OMPStream<T>>(args...);
9595

9696
#elif defined(FUTHARK)
9797
// Use the Futhark implementation
98-
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
98+
return std::make_unique<FutharkStream<T>>(args...);
9999

100100
#else
101101

Diff for: src/acc/ACCStream.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
#include "ACCStream.h"
99

1010
template <class T>
11-
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
12-
: array_size{ARRAY_SIZE}
11+
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
12+
T initA, T initB, T initC)
13+
: array_size{array_size}
1314
{
1415
acc_device_t device_type = acc_get_device_type();
15-
acc_set_device_num(device, device_type);
16+
acc_set_device_num(device_id, device_type);
1617

1718
// Set up data region on device
1819
this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
2526

2627
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
2728
{}
29+
30+
init_arrays(initA, initB, initC);
2831
}
2932

3033
template <class T>
@@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
6265
}
6366

6467
template <class T>
65-
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
68+
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
6669
{
6770
T *a = this->a;
6871
T *b = this->b;
6972
T *c = this->c;
7073
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
7174
{}
7275

73-
for (intptr_t i = 0; i < array_size; i++)
74-
{
75-
h_a[i] = a[i];
76-
h_b[i] = b[i];
77-
h_c[i] = c[i];
78-
}
76+
h_a = a;
77+
h_b = b;
78+
h_c = c;
7979
}
8080

8181
template <class T>

Diff for: src/acc/ACCStream.h

+13-20
Original file line numberDiff line numberDiff line change
@@ -19,32 +19,25 @@
1919
template <class T>
2020
class ACCStream : public Stream<T>
2121
{
22-
struct A{
23-
T *a;
24-
T *b;
25-
T *c;
26-
};
27-
28-
protected:
2922
// Size of arrays
3023
intptr_t array_size;
31-
A aa;
3224
// Device side pointers
33-
T *a;
34-
T *b;
35-
T *c;
25+
T* restrict a;
26+
T* restrict b;
27+
T* restrict c;
3628

3729
public:
38-
ACCStream(const intptr_t, int);
30+
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
31+
T initA, T initB, T initC);
3932
~ACCStream();
4033

41-
virtual void copy() override;
42-
virtual void add() override;
43-
virtual void mul() override;
44-
virtual void triad() override;
45-
virtual void nstream() override;
46-
virtual T dot() override;
34+
void copy() override;
35+
void add() override;
36+
void mul() override;
37+
void triad() override;
38+
void nstream() override;
39+
T dot() override;
4740

48-
virtual void init_arrays(T initA, T initB, T initC) override;
49-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
41+
void get_arrays(T const*& a, T const*& b, T const*& c) override;
42+
void init_arrays(T initA, T initB, T initC);
5043
};

Diff for: src/benchmark.h

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <array>
5+
#include <initializer_list>
6+
#include <iostream>
7+
8+
// Array values
9+
#define startA (0.1)
10+
#define startB (0.2)
11+
#define startC (0.0)
12+
#define startScalar (0.4)
13+
14+
// Benchmark Identifier: identifies individual & groups of benchmarks:
15+
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
16+
// - All: all kernels.
17+
// - Individual kernels only.
18+
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
19+
20+
struct Benchmark {
21+
BenchId id;
22+
char const* label;
23+
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
24+
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
25+
size_t weight;
26+
// Is it one of: Copy, Mul, Add, Triad, Dot?
27+
bool classic = false;
28+
};
29+
30+
// Benchmarks in the order in which - if present - should be run for validation purposes:
31+
constexpr size_t num_benchmarks = 6;
32+
constexpr std::array<Benchmark, num_benchmarks> bench = {
33+
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
34+
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
35+
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
36+
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
37+
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
38+
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
39+
};
40+
41+
// Which buffers are needed by each benchmark
42+
inline bool needs_buffer(BenchId id, char n) {
43+
auto in = [n](std::initializer_list<char> values) {
44+
return std::find(values.begin(), values.end(), n) != values.end();
45+
};
46+
switch(id) {
47+
case BenchId::All: return in({'a','b','c'});
48+
case BenchId::Classic: return in({'a','b','c'});
49+
case BenchId::Copy: return in({'a','c'});
50+
case BenchId::Mul: return in({'b','c'});
51+
case BenchId::Add: return in({'a','b','c'});
52+
case BenchId::Triad: return in({'a','b','c'});
53+
case BenchId::Dot: return in({'a','b'});
54+
case BenchId::Nstream: return in({'a','b','c'});
55+
default:
56+
std::cerr << "Unknown benchmark" << std::endl;
57+
abort();
58+
}
59+
}
60+
61+
// Returns true if the benchmark needs to be run:
62+
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
63+
if (selection == BenchId::All) return true;
64+
if (selection == BenchId::Classic && b.classic) return true;
65+
return selection == b.id;
66+
}

Diff for: src/cuda/CUDAStream.cu

+26-14
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ void free_host(T* p) {
7777
}
7878

7979
template <class T>
80-
CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
80+
CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index,
81+
T initA, T initB, T initC)
8182
: array_size(array_size)
8283
{
8384
// Set device
@@ -131,14 +132,20 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
131132
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
132133

133134
// Check buffers fit on the device
134-
if (dprop.totalGlobalMem < total_bytes)
135+
if (dprop.totalGlobalMem < total_bytes) {
136+
std::cerr << "Requested array size of " << total_bytes * 1e-9
137+
<< " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl;
135138
throw std::runtime_error("Device does not have enough memory for all buffers");
139+
}
136140

137141
// Allocate buffers:
138142
d_a = alloc_device<T>(array_size);
139143
d_b = alloc_device<T>(array_size);
140144
d_c = alloc_device<T>(array_size);
141145
sums = alloc_host<T>(dot_num_blocks);
146+
147+
// Initialize buffers:
148+
init_arrays(initA, initB, initC);
142149
}
143150

144151
template <class T>
@@ -204,21 +211,26 @@ void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
204211
}
205212

206213
template <class T>
207-
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
214+
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
208215
{
209-
// Copy device memory to host
210-
#if defined(PAGEFAULT) || defined(MANAGED)
211216
CU(cudaStreamSynchronize(stream));
212-
for (intptr_t i = 0; i < array_size; ++i)
213-
{
214-
a[i] = d_a[i];
215-
b[i] = d_b[i];
216-
c[i] = d_c[i];
217-
}
217+
#if defined(PAGEFAULT) || defined(MANAGED)
218+
// Unified memory: return pointers to device memory
219+
a = d_a;
220+
b = d_b;
221+
c = d_c;
218222
#else
219-
CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost));
220-
CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost));
221-
CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost));
223+
// No Unified memory: copy data to the host
224+
size_t nbytes = array_size * sizeof(T);
225+
h_a.resize(array_size);
226+
h_b.resize(array_size);
227+
h_c.resize(array_size);
228+
a = h_a.data();
229+
b = h_b.data();
230+
c = h_c.data();
231+
CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
232+
CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
233+
CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
222234
#endif
223235
}
224236

0 commit comments

Comments
 (0)