Skip to content

Commit 2b9129e

Browse files
committed
Move host-side allocation to benchmarks and reuse device with UVM
This commit puts benchmarks in control of allocating the host memory used for verifying the results. This enables benchmarks that use Unified Memory for the device allocations, to avoid the host-side allocation and just pass pointers to the device allocation to the benchmark driver. Closes UoB-HPC#128 .
1 parent 321ba62 commit 2b9129e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+609
-533
lines changed

Diff for: CMakeLists.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
4444
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
4545
endif ()
4646

47+
option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)
48+
4749
# setup some defaults flags for everything
4850
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
49-
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
51+
set(DEFAULT_RELEASE_FLAGS -O3)
52+
if (BUILD_NATIVE)
53+
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
54+
endif()
5055

5156
macro(hint_flag FLAG DESCRIPTION)
5257
if (NOT DEFINED ${FLAG})

Diff for: src/Stream.h

+4-9
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77

88
#pragma once
99

10+
#include <array>
1011
#include <vector>
1112
#include <string>
12-
13-
// Array values
14-
#define startA (0.1)
15-
#define startB (0.2)
16-
#define startC (0.0)
17-
#define startScalar (0.4)
13+
#include "benchmark.h"
1814

1915
template <class T>
2016
class Stream
@@ -31,9 +27,8 @@ class Stream
3127
virtual void nstream() = 0;
3228
virtual T dot() = 0;
3329

34-
// Copy memory between host and device
35-
virtual void init_arrays(T initA, T initB, T initC) = 0;
36-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
30+
// Set pointers to read from arrays
31+
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
3732
};
3833

3934
// Implementation specific device functions

Diff for: src/StreamModels.h

+17-17
Original file line numberDiff line numberDiff line change
@@ -35,67 +35,67 @@
3535
#include "FutharkStream.h"
3636
#endif
3737

38-
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
38+
template <typename T, typename...Args>
39+
std::unique_ptr<Stream<T>> make_stream(Args... args) {
4040
#if defined(CUDA)
4141
// Use the CUDA implementation
42-
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
42+
return std::make_unique<CUDAStream<T>>(args...);
4343

4444
#elif defined(HIP)
4545
// Use the HIP implementation
46-
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
46+
return std::make_unique<HIPStream<T>>(args...);
4747

4848
#elif defined(HC)
4949
// Use the HC implementation
50-
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
50+
return std::make_unique<HCStream<T>>(args...);
5151

5252
#elif defined(OCL)
5353
// Use the OpenCL implementation
54-
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
54+
return std::make_unique<OCLStream<T>>(args...);
5555

5656
#elif defined(USE_RAJA)
5757
// Use the RAJA implementation
58-
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
58+
return std::make_unique<RAJAStream<T>>(args...);
5959

6060
#elif defined(KOKKOS)
6161
// Use the Kokkos implementation
62-
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
62+
return std::make_unique<KokkosStream<T>>(args...);
6363

6464
#elif defined(STD_DATA)
6565
// Use the C++ STD data-oriented implementation
66-
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
66+
return std::make_unique<STDDataStream<T>>(args...);
6767

6868
#elif defined(STD_INDICES)
6969
// Use the C++ STD index-oriented implementation
70-
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
70+
return std::make_unique<STDIndicesStream<T>>(args...);
7171

7272
#elif defined(STD_RANGES)
7373
// Use the C++ STD ranges implementation
74-
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
74+
return std::make_unique<STDRangesStream<T>>(args...);
7575

7676
#elif defined(TBB)
7777
// Use the C++20 implementation
78-
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
78+
return std::make_unique<TBBStream<T>>(args...);
7979

8080
#elif defined(THRUST)
8181
// Use the Thrust implementation
82-
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
82+
return std::make_unique<ThrustStream<T>>(args...);
8383

8484
#elif defined(ACC)
8585
// Use the OpenACC implementation
86-
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
86+
return std::make_unique<ACCStream<T>>(args...);
8787

8888
#elif defined(SYCL) || defined(SYCL2020)
8989
// Use the SYCL implementation
90-
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
90+
return std::make_unique<SYCLStream<T>>(args...);
9191

9292
#elif defined(OMP)
9393
// Use the OpenMP implementation
94-
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
94+
return std::make_unique<OMPStream<T>>(args...);
9595

9696
#elif defined(FUTHARK)
9797
// Use the Futhark implementation
98-
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
98+
return std::make_unique<FutharkStream<T>>(args...);
9999

100100
#else
101101

Diff for: src/acc/ACCStream.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
#include "ACCStream.h"
99

1010
template <class T>
11-
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
12-
: array_size{ARRAY_SIZE}
11+
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
12+
T initA, T initB, T initC)
13+
: array_size{array_size}
1314
{
1415
acc_device_t device_type = acc_get_device_type();
15-
acc_set_device_num(device, device_type);
16+
acc_set_device_num(device_id, device_type);
1617

1718
// Set up data region on device
1819
this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
2526

2627
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
2728
{}
29+
30+
init_arrays(initA, initB, initC);
2831
}
2932

3033
template <class T>
@@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
6265
}
6366

6467
template <class T>
65-
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
68+
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
6669
{
6770
T *a = this->a;
6871
T *b = this->b;
6972
T *c = this->c;
7073
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
7174
{}
7275

73-
for (intptr_t i = 0; i < array_size; i++)
74-
{
75-
h_a[i] = a[i];
76-
h_b[i] = b[i];
77-
h_c[i] = c[i];
78-
}
76+
h_a = a;
77+
h_b = b;
78+
h_c = c;
7979
}
8080

8181
template <class T>

Diff for: src/acc/ACCStream.h

+13-20
Original file line numberDiff line numberDiff line change
@@ -19,32 +19,25 @@
1919
template <class T>
2020
class ACCStream : public Stream<T>
2121
{
22-
struct A{
23-
T *a;
24-
T *b;
25-
T *c;
26-
};
27-
28-
protected:
2922
// Size of arrays
3023
intptr_t array_size;
31-
A aa;
3224
// Device side pointers
33-
T *a;
34-
T *b;
35-
T *c;
25+
T* restrict a;
26+
T* restrict b;
27+
T* restrict c;
3628

3729
public:
38-
ACCStream(const intptr_t, int);
30+
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
31+
T initA, T initB, T initC);
3932
~ACCStream();
4033

41-
virtual void copy() override;
42-
virtual void add() override;
43-
virtual void mul() override;
44-
virtual void triad() override;
45-
virtual void nstream() override;
46-
virtual T dot() override;
34+
void copy() override;
35+
void add() override;
36+
void mul() override;
37+
void triad() override;
38+
void nstream() override;
39+
T dot() override;
4740

48-
virtual void init_arrays(T initA, T initB, T initC) override;
49-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
41+
void get_arrays(T const*& a, T const*& b, T const*& c) override;
42+
void init_arrays(T initA, T initB, T initC);
5043
};

Diff for: src/benchmark.h

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <array>
5+
#include <initializer_list>
6+
#include <iostream>
7+
8+
// Array values
9+
#define startA (0.1)
10+
#define startB (0.2)
11+
#define startC (0.0)
12+
#define startScalar (0.4)
13+
14+
// Benchmark Identifier: identifies individual & groups of benchmarks:
15+
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
16+
// - All: all kernels.
17+
// - Individual kernels only.
18+
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
19+
20+
struct Benchmark {
21+
BenchId id;
22+
char const* label;
23+
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
24+
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
25+
size_t weight;
26+
// Is it one of: Copy, Mul, Add, Triad, Dot?
27+
bool classic = false;
28+
};
29+
30+
// Benchmarks in the order in which - if present - should be run for validation purposes:
31+
constexpr size_t num_benchmarks = 6;
32+
constexpr std::array<Benchmark, num_benchmarks> bench = {
33+
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
34+
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
35+
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
36+
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
37+
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
38+
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
39+
};
40+
41+
// Which buffers are needed by each benchmark
42+
inline bool needs_buffer(BenchId id, char n) {
43+
auto in = [n](std::initializer_list<char> values) {
44+
return std::find(values.begin(), values.end(), n) != values.end();
45+
};
46+
switch(id) {
47+
case BenchId::All: return in({'a','b','c'});
48+
case BenchId::Classic: return in({'a','b','c'});
49+
case BenchId::Copy: return in({'a','c'});
50+
case BenchId::Mul: return in({'b','c'});
51+
case BenchId::Add: return in({'a','b','c'});
52+
case BenchId::Triad: return in({'a','b','c'});
53+
case BenchId::Dot: return in({'a','b'});
54+
case BenchId::Nstream: return in({'a','b','c'});
55+
default:
56+
std::cerr << "Unknown benchmark" << std::endl;
57+
abort();
58+
}
59+
}
60+
61+
// Returns true if the benchmark needs to be run:
62+
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
63+
if (selection == BenchId::All) return true;
64+
if (selection == BenchId::Classic && b.classic) return true;
65+
return selection == b.id;
66+
}

Diff for: src/ci-prepare-bionic.sh

+19-18
Original file line numberDiff line numberDiff line change
@@ -138,9 +138,9 @@ setup_aocc() {
138138

139139
setup_nvhpc() {
140140
echo "Preparing Nvidia HPC SDK"
141-
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
142-
local nvhpc_release="2023_231"
143-
local cuda_ver="12.0"
141+
local nvhpc_ver="24.5"
142+
local nvhpc_release="2024_245"
143+
local cuda_ver="12.4"
144144

145145
local tarball="nvhpc_$nvhpc_ver.tar.gz"
146146

@@ -237,7 +237,10 @@ setup_tbb() {
237237

238238
setup_clang_gcc() {
239239

240-
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
240+
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 libc6
241+
wget https://apt.llvm.org/llvm.sh
242+
chmod +x llvm.sh
243+
apt install -y clang-18 libclang-18-dev clang-tools-18 libomp-18-dev llvm-18-dev lld-18
241244

242245
export_var GCC_CXX "$(which g++-12)"
243246
verify_bin_exists "$GCC_CXX"
@@ -289,20 +292,18 @@ setup_dpcpp() {
289292
check_size
290293
}
291294

292-
setup_hipsycl() {
295+
setup_adaptivecpp() {
293296

294297
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
295-
local hipsycl_ver="0.9.1"
296-
local tarball="v$hipsycl_ver.tar.gz"
297-
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
298-
299-
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
300-
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
298+
local adaptivecpp_ver="24.02.0"
299+
local tarball="AdaptiveCpp-${adaptivecpp_ver}.tar.gz"
300+
local install_dir="$PWD/adaptivecpp_dist_v${adaptivecpp_ver}"
301+
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/refs/tags/v${adaptivecpp_ver}.tar.gz"
301302

302303
get_and_untar "$tarball" "$url"
303304

304305
if [ "$SETUP" = true ]; then
305-
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
306+
local src="$PWD/AdaptiveCpp-${adaptivecpp_ver}"
306307
rm -rf "$src/build"
307308
rm -rf "$install_dir"
308309
cmake "-B$src/build" "-H$src" \
@@ -315,10 +316,10 @@ setup_hipsycl() {
315316
cmake --build "$src/build" --target install -j "$(nproc)"
316317
fi
317318

318-
export_var HIPSYCL_DIR "$install_dir"
319-
verify_dir_exists "$HIPSYCL_DIR"
320-
# note: this will forward --version to the default compiler so it won't say anything about hipsycl
321-
"$HIPSYCL_DIR/bin/syclcc-clang" --version
319+
export_var AdaptiveCpp_DIR "$install_dir"
320+
verify_dir_exists "$AdaptiveCpp_DIR"
321+
# note: this will forward --version to the default compiler so it won't say anything about adaptivecpp
322+
"$AdaptiveCpp_DIR/bin/syclcc-clang" --version
322323
check_size
323324
}
324325

@@ -391,7 +392,7 @@ setup_cmake() {
391392
}
392393

393394
if [ "$PARALLEL" = true ]; then
394-
(setup_clang_gcc && setup_rocm && setup_hipsycl) & # these need apt so run sequentially
395+
(setup_clang_gcc && setup_rocm && setup_adaptivecpp) & # these need apt so run sequentially
395396
setup_cmake &
396397
setup_oclcpu &
397398
setup_aocc &
@@ -406,7 +407,7 @@ else
406407
# these need apt
407408
setup_clang_gcc
408409
setup_rocm
409-
setup_hipsycl
410+
setup_adaptivecpp
410411
setup_cmake
411412
setup_aocc
412413
setup_oclcpu

0 commit comments

Comments
 (0)