Skip to content

Commit 1c4220b

Browse files
committed
Move host-side allocation to benchmarks and reuse device with UVM
This commit puts benchmarks in control of allocating the host memory used for verifying the results. This enables benchmarks that use Unified Memory for the device allocations, to avoid the host-side allocation and just pass pointers to the device allocation to the benchmark driver. Closes UoB-HPC#128 .
1 parent 321ba62 commit 1c4220b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+605
-532
lines changed

Diff for: CMakeLists.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
4444
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
4545
endif ()
4646

47+
option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)
48+
4749
# setup some defaults flags for everything
4850
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
49-
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
51+
set(DEFAULT_RELEASE_FLAGS -O3)
52+
if (BUILD_NATIVE)
53+
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
54+
endif()
5055

5156
macro(hint_flag FLAG DESCRIPTION)
5257
if (NOT DEFINED ${FLAG})

Diff for: src/Stream.h

+4-9
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77

88
#pragma once
99

10+
#include <array>
1011
#include <vector>
1112
#include <string>
12-
13-
// Array values
14-
#define startA (0.1)
15-
#define startB (0.2)
16-
#define startC (0.0)
17-
#define startScalar (0.4)
13+
#include "benchmark.h"
1814

1915
template <class T>
2016
class Stream
@@ -31,9 +27,8 @@ class Stream
3127
virtual void nstream() = 0;
3228
virtual T dot() = 0;
3329

34-
// Copy memory between host and device
35-
virtual void init_arrays(T initA, T initB, T initC) = 0;
36-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
30+
// Set pointers to read from arrays
31+
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
3732
};
3833

3934
// Implementation specific device functions

Diff for: src/StreamModels.h

+17-17
Original file line numberDiff line numberDiff line change
@@ -35,67 +35,67 @@
3535
#include "FutharkStream.h"
3636
#endif
3737

38-
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
38+
template <typename T, typename...Args>
39+
std::unique_ptr<Stream<T>> make_stream(Args... args) {
4040
#if defined(CUDA)
4141
// Use the CUDA implementation
42-
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
42+
return std::make_unique<CUDAStream<T>>(args...);
4343

4444
#elif defined(HIP)
4545
// Use the HIP implementation
46-
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
46+
return std::make_unique<HIPStream<T>>(args...);
4747

4848
#elif defined(HC)
4949
// Use the HC implementation
50-
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
50+
return std::make_unique<HCStream<T>>(args...);
5151

5252
#elif defined(OCL)
5353
// Use the OpenCL implementation
54-
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
54+
return std::make_unique<OCLStream<T>>(args...);
5555

5656
#elif defined(USE_RAJA)
5757
// Use the RAJA implementation
58-
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
58+
return std::make_unique<RAJAStream<T>>(args...);
5959

6060
#elif defined(KOKKOS)
6161
// Use the Kokkos implementation
62-
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
62+
return std::make_unique<KokkosStream<T>>(args...);
6363

6464
#elif defined(STD_DATA)
6565
// Use the C++ STD data-oriented implementation
66-
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
66+
return std::make_unique<STDDataStream<T>>(args...);
6767

6868
#elif defined(STD_INDICES)
6969
// Use the C++ STD index-oriented implementation
70-
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
70+
return std::make_unique<STDIndicesStream<T>>(args...);
7171

7272
#elif defined(STD_RANGES)
7373
// Use the C++ STD ranges implementation
74-
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
74+
return std::make_unique<STDRangesStream<T>>(args...);
7575

7676
#elif defined(TBB)
7777
// Use the C++20 implementation
78-
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
78+
return std::make_unique<TBBStream<T>>(args...);
7979

8080
#elif defined(THRUST)
8181
// Use the Thrust implementation
82-
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
82+
return std::make_unique<ThrustStream<T>>(args...);
8383

8484
#elif defined(ACC)
8585
// Use the OpenACC implementation
86-
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
86+
return std::make_unique<ACCStream<T>>(args...);
8787

8888
#elif defined(SYCL) || defined(SYCL2020)
8989
// Use the SYCL implementation
90-
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
90+
return std::make_unique<SYCLStream<T>>(args...);
9191

9292
#elif defined(OMP)
9393
// Use the OpenMP implementation
94-
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
94+
return std::make_unique<OMPStream<T>>(args...);
9595

9696
#elif defined(FUTHARK)
9797
// Use the Futhark implementation
98-
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
98+
return std::make_unique<FutharkStream<T>>(args...);
9999

100100
#else
101101

Diff for: src/acc/ACCStream.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
#include "ACCStream.h"
99

1010
template <class T>
11-
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
12-
: array_size{ARRAY_SIZE}
11+
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
12+
T initA, T initB, T initC)
13+
: array_size{array_size}
1314
{
1415
acc_device_t device_type = acc_get_device_type();
15-
acc_set_device_num(device, device_type);
16+
acc_set_device_num(device_id, device_type);
1617

1718
// Set up data region on device
1819
this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
2526

2627
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
2728
{}
29+
30+
init_arrays(initA, initB, initC);
2831
}
2932

3033
template <class T>
@@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
6265
}
6366

6467
template <class T>
65-
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
68+
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
6669
{
6770
T *a = this->a;
6871
T *b = this->b;
6972
T *c = this->c;
7073
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
7174
{}
7275

73-
for (intptr_t i = 0; i < array_size; i++)
74-
{
75-
h_a[i] = a[i];
76-
h_b[i] = b[i];
77-
h_c[i] = c[i];
78-
}
76+
h_a = a;
77+
h_b = b;
78+
h_c = c;
7979
}
8080

8181
template <class T>

Diff for: src/acc/ACCStream.h

+13-20
Original file line numberDiff line numberDiff line change
@@ -19,32 +19,25 @@
1919
template <class T>
2020
class ACCStream : public Stream<T>
2121
{
22-
struct A{
23-
T *a;
24-
T *b;
25-
T *c;
26-
};
27-
28-
protected:
2922
// Size of arrays
3023
intptr_t array_size;
31-
A aa;
3224
// Device side pointers
33-
T *a;
34-
T *b;
35-
T *c;
25+
T* restrict a;
26+
T* restrict b;
27+
T* restrict c;
3628

3729
public:
38-
ACCStream(const intptr_t, int);
30+
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
31+
T initA, T initB, T initC);
3932
~ACCStream();
4033

41-
virtual void copy() override;
42-
virtual void add() override;
43-
virtual void mul() override;
44-
virtual void triad() override;
45-
virtual void nstream() override;
46-
virtual T dot() override;
34+
void copy() override;
35+
void add() override;
36+
void mul() override;
37+
void triad() override;
38+
void nstream() override;
39+
T dot() override;
4740

48-
virtual void init_arrays(T initA, T initB, T initC) override;
49-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
41+
void get_arrays(T const*& a, T const*& b, T const*& c) override;
42+
void init_arrays(T initA, T initB, T initC);
5043
};

Diff for: src/benchmark.h

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <array>
5+
#include <initializer_list>
6+
#include <iostream>
7+
8+
// Array values
9+
#define startA (0.1)
10+
#define startB (0.2)
11+
#define startC (0.0)
12+
#define startScalar (0.4)
13+
14+
// Benchmark Identifier: identifies individual & groups of benchmarks:
15+
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
16+
// - All: all kernels.
17+
// - Individual kernels only.
18+
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
19+
20+
struct Benchmark {
21+
BenchId id;
22+
char const* label;
23+
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
24+
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
25+
size_t weight;
26+
// Is it one of: Copy, Mul, Add, Triad, Dot?
27+
bool classic = false;
28+
};
29+
30+
// Benchmarks in the order in which - if present - should be run for validation purposes:
31+
constexpr size_t num_benchmarks = 6;
32+
constexpr std::array<Benchmark, num_benchmarks> bench = {
33+
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
34+
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
35+
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
36+
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
37+
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
38+
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
39+
};
40+
41+
// Which buffers are needed by each benchmark
42+
inline bool needs_buffer(BenchId id, char n) {
43+
auto in = [n](std::initializer_list<char> values) {
44+
return std::find(values.begin(), values.end(), n) != values.end();
45+
};
46+
switch(id) {
47+
case BenchId::All: return in({'a','b','c'});
48+
case BenchId::Classic: return in({'a','b','c'});
49+
case BenchId::Copy: return in({'a','c'});
50+
case BenchId::Mul: return in({'b','c'});
51+
case BenchId::Add: return in({'a','b','c'});
52+
case BenchId::Triad: return in({'a','b','c'});
53+
case BenchId::Dot: return in({'a','b'});
54+
case BenchId::Nstream: return in({'a','b','c'});
55+
default:
56+
std::cerr << "Unknown benchmark" << std::endl;
57+
abort();
58+
}
59+
}
60+
61+
// Returns true if the benchmark needs to be run:
62+
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
63+
if (selection == BenchId::All) return true;
64+
if (selection == BenchId::Classic && b.classic) return true;
65+
return selection == b.id;
66+
}

Diff for: src/ci-prepare-bionic.sh

+15-17
Original file line numberDiff line numberDiff line change
@@ -138,9 +138,9 @@ setup_aocc() {
138138

139139
setup_nvhpc() {
140140
echo "Preparing Nvidia HPC SDK"
141-
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
142-
local nvhpc_release="2023_231"
143-
local cuda_ver="12.0"
141+
local nvhpc_ver="24.5"
142+
local nvhpc_release="2024_245"
143+
local cuda_ver="12.4"
144144

145145
local tarball="nvhpc_$nvhpc_ver.tar.gz"
146146

@@ -289,20 +289,18 @@ setup_dpcpp() {
289289
check_size
290290
}
291291

292-
setup_hipsycl() {
292+
setup_adaptivecpp() {
293293

294294
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
295-
local hipsycl_ver="0.9.1"
296-
local tarball="v$hipsycl_ver.tar.gz"
297-
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
298-
299-
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
300-
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
295+
local adaptivecpp_ver="24.02.0"
296+
local tarball="adaptivecpp_v${adaptivecpp_ver}.tar.gz"
297+
local install_dir="$PWD/adaptivecpp_dist_v${adaptivecpp_ver}"
298+
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/refs/tags/v${adaptivecpp_ver}.tar.gz"
301299

302300
get_and_untar "$tarball" "$url"
303301

304302
if [ "$SETUP" = true ]; then
305-
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
303+
local src="$PWD/AdaptiveCpp-v${adaptivecpp_ver}"
306304
rm -rf "$src/build"
307305
rm -rf "$install_dir"
308306
cmake "-B$src/build" "-H$src" \
@@ -315,10 +313,10 @@ setup_hipsycl() {
315313
cmake --build "$src/build" --target install -j "$(nproc)"
316314
fi
317315

318-
export_var HIPSYCL_DIR "$install_dir"
319-
verify_dir_exists "$HIPSYCL_DIR"
320-
# note: this will forward --version to the default compiler so it won't say anything about hipsycl
321-
"$HIPSYCL_DIR/bin/syclcc-clang" --version
316+
export_var AdaptiveCpp_DIR "$install_dir"
317+
verify_dir_exists "$AdaptiveCpp_DIR"
318+
# note: this will forward --version to the default compiler so it won't say anything about adaptivecpp
319+
"$AdaptiveCpp_DIR/bin/syclcc-clang" --version
322320
check_size
323321
}
324322

@@ -391,7 +389,7 @@ setup_cmake() {
391389
}
392390

393391
if [ "$PARALLEL" = true ]; then
394-
(setup_clang_gcc && setup_rocm && setup_hipsycl) & # these need apt so run sequentially
392+
(setup_clang_gcc && setup_rocm && setup_adaptivecpp) & # these need apt so run sequentially
395393
setup_cmake &
396394
setup_oclcpu &
397395
setup_aocc &
@@ -406,7 +404,7 @@ else
406404
# these need apt
407405
setup_clang_gcc
408406
setup_rocm
409-
setup_hipsycl
407+
setup_adaptivecpp
410408
setup_cmake
411409
setup_aocc
412410
setup_oclcpu

Diff for: src/ci-test-compile.sh

+6-6
Original file line numberDiff line numberDiff line change
@@ -335,10 +335,10 @@ build_dpcpp() {
335335
# run_build intel_build "dpcpp" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} -DSYCL_COMPILER=ONEAPI-DPCPP"
336336
}
337337

338-
build_hipsycl() {
339-
run_build hipsycl_build "syclcc" sycl "
340-
-DSYCL_COMPILER=HIPSYCL \
341-
-DSYCL_COMPILER_DIR=${HIPSYCL_DIR:?}"
338+
build_adaptivecpp() {
339+
run_build adaptivecpp_build "syclcc" sycl "
340+
-DSYCL_COMPILER=AdaptiveCpp \
341+
-DSYCL_COMPILER_DIR=${AdaptiveCpp_DIR:?}"
342342
}
343343

344344
echo "Test compiling with ${COMPILER} CXX for ${MODEL} model"
@@ -352,7 +352,7 @@ aocc) build_aocc ;;
352352
aomp) build_aomp ;;
353353
hip) build_hip ;;
354354
dpcpp) build_dpcpp ;;
355-
hipsycl) build_hipsycl ;;
355+
adaptivecpp) build_adaptivecpp ;;
356356

357357
# XXX below are local only; licence or very large download required, candidate for local runner
358358
icpx) build_icpx ;;
@@ -366,7 +366,7 @@ all)
366366
build_aomp
367367
build_hip
368368
build_dpcpp
369-
build_hipsycl
369+
build_adaptivecpp
370370

371371
build_icpx
372372
build_icpc

0 commit comments

Comments
 (0)