Commit 0b7fdb3

Add Scan, Read, and Write benchmarks
1 parent 62a5051 · commit 0b7fdb3

33 files changed: 808 additions, 122 deletions

Diff for: CMakeLists.txt

8 additions, 5 deletions

@@ -31,8 +31,8 @@ endmacro()
 # the final executable name
 set(EXE_NAME babelstream)

-# for chrono, make_unique, and some basic CXX features, models can overwrite this if required
-set(CMAKE_CXX_STANDARD 14)
+# for chrono, make_unique, exclusive_scan, and other basic features, models can override if required
+set(CMAKE_CXX_STANDARD 17)

 if (NOT CMAKE_BUILD_TYPE)
   message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
@@ -177,10 +177,13 @@ else ()
   message(STATUS "Selected model : ${MODEL}")
 endif ()

-if (MODEL STREQUAL "sycl2020")
+if (MODEL STREQUAL "sycl2020-acc")
   message(FATAL_ERROR "
-    Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
-    Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM")
+    Model sycl2020-acc has been renamed to sycl2020 and may be enabled with the -DSYCL_ACCESS=ACCESSOR cmake option.")
+endif ()
+if (MODEL STREQUAL "sycl2020-usm")
+  message(FATAL_ERROR "
+    Model sycl2020-usm has been renamed to sycl2020 and may be enabled with the -DSYCL_ACCESS=USM cmake option.")
 endif ()

 # load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
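
Context for the standard bump: the new Scan benchmark relies on std::exclusive_scan from <numeric>, which is only available from C++17 onwards, hence CMAKE_CXX_STANDARD moving from 14 to 17. Below is a minimal stand-alone illustration of that call (not part of the commit, shown only to make the requirement concrete):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  std::vector<std::uint64_t> in{0, 1, 2, 3};
  std::vector<std::uint64_t> out(in.size());
  // out[i] = in[0] + ... + in[i-1], starting from the initial value 0 (C++17, <numeric>)
  std::exclusive_scan(in.begin(), in.end(), out.begin(), std::uint64_t(0));
  assert(out[0] == 0 && out[1] == 0 && out[2] == 1 && out[3] == 3);
  return 0;
}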

Diff for: src/Stream.h

9 additions, 1 deletion

@@ -11,10 +11,15 @@
 #include <array>
 #include <vector>
 #include <string>
+#include <type_traits>
 #include "benchmark.h"

 using std::intptr_t;

+template <typename T>
+using scan_t = std::conditional_t<sizeof(T) == 4, std::uint32_t,
+                                  std::conditional_t<sizeof(T) == 8, std::uint64_t, void>>;
+
 template <class T>
 class Stream
 {
@@ -29,9 +34,12 @@ class Stream
   virtual void triad() = 0;
   virtual void nstream() = 0;
   virtual T dot() = 0;
+  virtual void read() = 0;
+  virtual void write(T initA) = 0;
+  virtual void scan() = 0;

   // Set pointers to read from arrays
-  virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
+  virtual void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) = 0;
 };

 // Implementation specific device functions
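
The scan_t alias maps each element type to an unsigned integer of the same width, 32-bit for 4-byte elements and 64-bit for 8-byte ones, so the Scan benchmark moves the same number of bytes as the floating-point kernels while remaining exactly reproducible; void flags unsupported widths. A compile-time sanity check, illustrative only and assuming the usual 4-byte float and 8-byte double:

#include <cstdint>
#include <type_traits>

// Same alias as in Stream.h, repeated here so the check is self-contained
template <typename T>
using scan_t = std::conditional_t<sizeof(T) == 4, std::uint32_t,
                                  std::conditional_t<sizeof(T) == 8, std::uint64_t, void>>;

static_assert(std::is_same_v<scan_t<float>,  std::uint32_t>, "float scans use 32-bit integers");
static_assert(std::is_same_v<scan_t<double>, std::uint64_t>, "double scans use 64-bit integers");

int main() { return 0; }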

Diff for: src/acc/ACCStream.cpp

61 additions, 1 deletion

@@ -5,6 +5,7 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code

+#include <numeric>
 #include "ACCStream.h"

 template <class T>
@@ -24,6 +25,11 @@ ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_
   T * restrict b = this->b;
   T * restrict c = this->c;

+  if (needs_buffer(bs, 's')) {
+    s_i = new scan_t<T>[array_size];
+    s_o = new scan_t<T>[array_size];
+  }
+
   #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
   {}

@@ -46,6 +52,11 @@ ACCStream<T>::~ACCStream()
   delete[] a;
   delete[] b;
   delete[] c;
+
+  if (s_i) {
+    delete[] s_i;
+    delete[] s_o;
+  }
 }

 template <class T>
@@ -62,10 +73,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
     b[i] = initB;
     c[i] = initC;
   }
+
+  if (s_i) {
+    for (intptr_t i = 0; i < array_size; i++)
+    {
+      s_i[i] = scan_t<T>(i);
+    }
+  }
 }

 template <class T>
-void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
+void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c, scan_t<T> const*& h_s)
 {
   T *a = this->a;
   T *b = this->b;
@@ -76,6 +94,10 @@ void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
   h_a = a;
   h_b = b;
   h_c = c;
+
+  if (s_o) {
+    h_s = s_o;
+  }
 }

 template <class T>
@@ -169,6 +191,44 @@ T ACCStream<T>::dot()
   return sum;
 }

+template <class T>
+void ACCStream<T>::read()
+{
+  intptr_t array_size = this->array_size;
+  T * restrict a = this->a;
+  #pragma acc parallel loop present(a[0:array_size]) wait
+  for (intptr_t i = 0; i < array_size; i++)
+  {
+    T tmp = a[i];
+    if (tmp == T(3.14)) {
+      a[i] *= 2;
+    }
+  }
+}
+
+template <class T>
+void ACCStream<T>::write(T initA)
+{
+  intptr_t array_size = this->array_size;
+  T * restrict a = this->a;
+  #pragma acc parallel loop present(a[0:array_size]) wait
+  for (intptr_t i = 0; i < array_size; i++)
+  {
+    a[i] = initA;
+  }
+}
+
+template <class T>
+void ACCStream<T>::scan()
+{
+  if (!s_i) {
+    throw std::runtime_error("Trying to run scan but storage not allocated");
+  }
+
+  // OpenACC has no scan primitive; run the scan sequentially on the host
+  std::exclusive_scan(s_i, s_i + array_size, s_o, scan_t<T>(0));
+}
+
 void listDevices(void)
 {
   // Get number of devices
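
Because init_arrays fills the scan input with s_i[i] = i, every backend's exclusive scan should produce s_o[i] = 0 + 1 + ... + (i-1) = i*(i-1)/2, and the unsigned scan_t arithmetic keeps any wrap-around well defined. A small host-only sketch of that expectation (illustrative, not part of the commit):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  const std::size_t n = 1024;
  std::vector<std::uint64_t> s_i(n), s_o(n);
  for (std::size_t i = 0; i < n; i++) s_i[i] = i;      // same fill as init_arrays
  std::exclusive_scan(s_i.begin(), s_i.end(), s_o.begin(), std::uint64_t(0));
  assert(s_o[0] == 0);                                 // exclusive scan starts at the initial value
  for (std::size_t i = 1; i < n; i++)
    assert(s_o[i] == i * (i - 1) / 2);                 // sum of 0 .. i-1
  return 0;
}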

Diff for: src/acc/ACCStream.h

6 additions, 1 deletion

@@ -25,6 +25,8 @@ class ACCStream : public Stream<T>
   T* restrict a;
   T* restrict b;
   T* restrict c;
+  scan_t<T>* restrict s_i;
+  scan_t<T>* restrict s_o;

 public:
   ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
@@ -37,7 +39,10 @@ class ACCStream : public Stream<T>
   void triad() override;
   void nstream() override;
   T dot() override;
+  void read() override;
+  void write(T initA) override;
+  void scan() override;

-  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) override;
   void init_arrays(T initA, T initB, T initC);
 };

Diff for: src/benchmark.h

13 additions, 7 deletions

@@ -15,7 +15,7 @@
 // - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
 // - All: all kernels.
 // - Individual kernels only.
-enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
+enum class BenchId : int {Write, Copy, Mul, Add, Triad, Dot, Nstream, Scan, Read, Classic, All};

 struct Benchmark {
   BenchId id;
@@ -28,14 +28,17 @@ struct Benchmark {
 };

 // Benchmarks in the order in which - if present - should be run for validation purposes:
-constexpr size_t num_benchmarks = 6;
-constexpr std::array<Benchmark, num_benchmarks> bench = {
+constexpr size_t num_benchmarks = 9;
+inline constexpr std::array<Benchmark, num_benchmarks> bench = {
+  Benchmark { .id = BenchId::Write,   .label = "Write",   .weight = 1, .classic = false },
   Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true },
   Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true },
   Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true },
   Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true },
   Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
+  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false },
+  Benchmark { .id = BenchId::Scan,    .label = "Scan",    .weight = 2, .classic = false },
+  Benchmark { .id = BenchId::Read,    .label = "Read",    .weight = 1, .classic = false }
 };

 // Which buffers are needed by each benchmark
@@ -44,14 +47,17 @@ inline bool needs_buffer(BenchId id, char n) {
     return std::find(values.begin(), values.end(), n) != values.end();
   };
   switch(id) {
-    case BenchId::All:     return in({'a','b','c'});
-    case BenchId::Classic: return in({'a','b','c'});
+    case BenchId::All:     return in({'a','b','c','s'});
+    case BenchId::Classic: return in({'a','b','c'});
     case BenchId::Copy:    return in({'a','c'});
     case BenchId::Mul:     return in({'b','c'});
     case BenchId::Add:     return in({'a','b','c'});
     case BenchId::Triad:   return in({'a','b','c'});
     case BenchId::Dot:     return in({'a','b'});
-    case BenchId::Nstream: return in({'a','b','c'});
+    case BenchId::Nstream: return in({'a','b','c'});
+    case BenchId::Read:    return in({'a'});
+    case BenchId::Write:   return in({'a'});
+    case BenchId::Scan:    return in({'s'});
     default:
       std::cerr << "Unknown benchmark" << std::endl;
      abort();
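
needs_buffer is what lets each backend allocate the integer scan buffers only when a selection that includes Scan is requested, so classic-only runs keep their original memory footprint (see the guarded allocations in ACCStream above and CUDAStream below). A stand-alone sketch of the expected answers, assuming src/benchmark.h is on the include path:

#include <cassert>
#include "benchmark.h"   // assumed to be reachable on the include path (src/)

int main() {
  assert( needs_buffer(BenchId::Scan,    's'));   // Scan is the only kernel touching the scan buffers
  assert( needs_buffer(BenchId::All,     's'));   // "All" now includes Scan, so 's' must be allocated
  assert(!needs_buffer(BenchId::Classic, 's'));   // classic runs keep the original a/b/c footprint
  assert( needs_buffer(BenchId::Write,   'a'));   // Read and Write only need array a
  assert(!needs_buffer(BenchId::Read,    'b'));
  return 0;
}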

Diff for: src/cuda/CUDAStream.cu

58 additions, 3 deletions

@@ -6,6 +6,8 @@

 #include "CUDAStream.h"
 #include <nvml.h>
+#include <thrust/scan.h>
+#include <thrust/execution_policy.h>

 #if !defined(UNROLL_FACTOR)
 #define UNROLL_FACTOR 4
@@ -128,7 +130,8 @@ CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int devic
   // Size of partial sums for dot kernels
   size_t sums_bytes = sizeof(T) * dot_num_blocks;
   size_t array_bytes = sizeof(T) * array_size;
-  size_t total_bytes = array_bytes * size_t(3) + sums_bytes;
+  size_t scan_bytes = needs_buffer(bs, 's') ? size_t(2) * array_size * sizeof(scan_t<T>) : 0;
+  size_t total_bytes = array_bytes * size_t(3) + scan_bytes + sums_bytes;
   std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;

   // Check buffers fit on the device
@@ -144,6 +147,11 @@ CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int devic
   d_c = alloc_device<T>(array_size);
   sums = alloc_host<T>(dot_num_blocks);

+  if (needs_buffer(bs, 's')) {
+    d_si = alloc_device<scan_t<T>>(array_size);
+    d_so = alloc_device<scan_t<T>>(array_size);
+  }
+
   // Initialize buffers:
   init_arrays(initA, initB, initC);
 }
@@ -156,6 +164,10 @@ CUDAStream<T>::~CUDAStream()
   free_device(d_b);
   free_device(d_c);
   free_host(sums);
+  if (d_si) {
+    free_device(d_si);
+    free_device(d_so);
+  }
 }

 template <typename F>
@@ -203,22 +215,26 @@ void for_each(size_t array_size, F f) {
 template <class T>
 void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  for_each(array_size, [=,a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
+  for_each(array_size, [=,a=d_a,b=d_b,c=d_c,s=d_si] __device__ (size_t i) {
     a[i] = initA;
     b[i] = initB;
     c[i] = initC;
+    if (s) {
+      s[i] = static_cast<scan_t<T>>(i);
+    }
   });
 }

 template <class T>
-void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
+void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s)
 {
   CU(cudaStreamSynchronize(stream));
 #if defined(PAGEFAULT) || defined(MANAGED)
   // Unified memory: return pointers to device memory
   a = d_a;
   b = d_b;
   c = d_c;
+  s = d_so;
 #else
   // No Unified memory: copy data to the host
   size_t nbytes = array_size * sizeof(T);
@@ -231,7 +247,14 @@ void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
   CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
   CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
   CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
+  if (d_so) {
+    size_t nbytes = array_size * sizeof(scan_t<T>);
+    h_s.resize(array_size);
+    s = h_s.data();
+    CU(cudaMemcpy(h_s.data(), d_so, nbytes, cudaMemcpyDeviceToHost));
+  }
 #endif
+  CU(cudaStreamSynchronize(stream));
 }

 template <class T>
@@ -308,6 +331,38 @@ T CUDAStream<T>::dot()
   return sum;
 }

+template <class T>
+void CUDAStream<T>::scan()
+{
+  if (!d_so) {
+    std::cerr << "Trying to run scan but storage not allocated" << std::endl;
+    std::terminate();
+  }
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), d_si, d_si + array_size, d_so);
+  CU(cudaPeekAtLastError());
+  CU(cudaStreamSynchronize(stream));
+}
+
+template <class T>
+void CUDAStream<T>::read()
+{
+  for_each(array_size, [a=d_a] __device__ (size_t i) {
+    T tmp = a[i];
+    // Control dependency on the load of a[i]: never true, but checking it requires loading the value
+    if (tmp == T(3.14)) {
+      a[i] *= 2;
+    }
+  });
+}
+
+template <class T>
+void CUDAStream<T>::write(T initA)
+{
+  for_each(array_size, [a=d_a, initA] __device__ (size_t i) {
+    a[i] = initA;
+  });
+}
+
 void listDevices(void)
 {
   // Get number of devices
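
With the extra out-parameter on get_arrays, host-side validation can treat the scan result like the other arrays: fetch the pointer, then walk it against the running sum that an exclusive scan of 0, 1, 2, ... must produce. A hedged sketch of such a check; the check_scan helper is hypothetical and not part of this commit (BabelStream's actual validation lives in the driver, which is not shown here):

#include <cstdint>
#include "Stream.h"   // assumed include path; provides Stream<T> and scan_t<T>

// Hypothetical helper, for illustration only
template <class T>
bool check_scan(Stream<T>& stream, intptr_t array_size) {
  T const *a = nullptr, *b = nullptr, *c = nullptr;
  scan_t<T> const *s = nullptr;
  stream.get_arrays(a, b, c, s);   // s is a host copy (or UVM pointer) of the scan output
  if (!s) return false;            // scan buffers were not allocated for this run
  scan_t<T> expected = 0;          // an exclusive scan starts at the initial value
  for (intptr_t i = 0; i < array_size; i++) {
    if (s[i] != expected) return false;
    expected += scan_t<T>(i);      // running sum of 0, 1, 2, ...; unsigned wrap-around matches the device
  }
  return true;
}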

Diff for: src/cuda/CUDAStream.h

8 additions, 1 deletion

@@ -35,6 +35,10 @@ class CUDAStream : public Stream<T>

   // If UVM is disabled, host arrays for verification purposes
   std::vector<T> h_a, h_b, h_c;
+  std::vector<scan_t<T>> h_s;
+
+  // Device buffers for the scan input and output
+  scan_t<T> *d_si, *d_so;

   // Number of blocks for dot kernel
   intptr_t dot_num_blocks;
@@ -50,7 +54,10 @@ class CUDAStream : public Stream<T>
   void triad() override;
   void nstream() override;
   T dot() override;
+  void read() override;
+  void write(T initA) override;
+  void scan() override;

-  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) override;
   void init_arrays(T initA, T initB, T initC);
 };
