Skip to content

Commit bbd0ffa

Browse files
committed
Move host-side allocation to benchmarks and reuse device with UVM
This commit puts benchmarks in control of allocating the host memory used for verifying the results. This enables benchmarks that use Unified Memory for device allocations to avoid the host-side allocation and simply pass pointers to the device allocation to the benchmark driver. Closes UoB-HPC#128.
1 parent 321ba62 commit bbd0ffa

40 files changed

+546
-496
lines changed

Diff for: CMakeLists.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
4444
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
4545
endif ()
4646

47+
option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)
48+
4749
# setup some defaults flags for everything
4850
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
49-
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
51+
set(DEFAULT_RELEASE_FLAGS -O3)
52+
if (BUILD_NATIVE)
53+
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
54+
endif()
5055

5156
macro(hint_flag FLAG DESCRIPTION)
5257
if (NOT DEFINED ${FLAG})

Diff for: src/Stream.h

+4-9
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77

88
#pragma once
99

10+
#include <array>
1011
#include <vector>
1112
#include <string>
12-
13-
// Array values
14-
#define startA (0.1)
15-
#define startB (0.2)
16-
#define startC (0.0)
17-
#define startScalar (0.4)
13+
#include "benchmark.h"
1814

1915
template <class T>
2016
class Stream
@@ -31,9 +27,8 @@ class Stream
3127
virtual void nstream() = 0;
3228
virtual T dot() = 0;
3329

34-
// Copy memory between host and device
35-
virtual void init_arrays(T initA, T initB, T initC) = 0;
36-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
30+
// Set pointers to read from arrays
31+
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
3732
};
3833

3934
// Implementation specific device functions

Diff for: src/StreamModels.h

+17-17
Original file line numberDiff line numberDiff line change
@@ -35,67 +35,67 @@
3535
#include "FutharkStream.h"
3636
#endif
3737

38-
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
38+
template <typename T, typename...Args>
39+
std::unique_ptr<Stream<T>> make_stream(Args... args) {
4040
#if defined(CUDA)
4141
// Use the CUDA implementation
42-
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
42+
return std::make_unique<CUDAStream<T>>(args...);
4343

4444
#elif defined(HIP)
4545
// Use the HIP implementation
46-
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
46+
return std::make_unique<HIPStream<T>>(args...);
4747

4848
#elif defined(HC)
4949
// Use the HC implementation
50-
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
50+
return std::make_unique<HCStream<T>>(args...);
5151

5252
#elif defined(OCL)
5353
// Use the OpenCL implementation
54-
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
54+
return std::make_unique<OCLStream<T>>(args...);
5555

5656
#elif defined(USE_RAJA)
5757
// Use the RAJA implementation
58-
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
58+
return std::make_unique<RAJAStream<T>>(args...);
5959

6060
#elif defined(KOKKOS)
6161
// Use the Kokkos implementation
62-
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
62+
return std::make_unique<KokkosStream<T>>(args...);
6363

6464
#elif defined(STD_DATA)
6565
// Use the C++ STD data-oriented implementation
66-
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
66+
return std::make_unique<STDDataStream<T>>(args...);
6767

6868
#elif defined(STD_INDICES)
6969
// Use the C++ STD index-oriented implementation
70-
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
70+
return std::make_unique<STDIndicesStream<T>>(args...);
7171

7272
#elif defined(STD_RANGES)
7373
// Use the C++ STD ranges implementation
74-
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
74+
return std::make_unique<STDRangesStream<T>>(args...);
7575

7676
#elif defined(TBB)
7777
// Use the C++20 implementation
78-
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
78+
return std::make_unique<TBBStream<T>>(args...);
7979

8080
#elif defined(THRUST)
8181
// Use the Thrust implementation
82-
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
82+
return std::make_unique<ThrustStream<T>>(args...);
8383

8484
#elif defined(ACC)
8585
// Use the OpenACC implementation
86-
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
86+
return std::make_unique<ACCStream<T>>(args...);
8787

8888
#elif defined(SYCL) || defined(SYCL2020)
8989
// Use the SYCL implementation
90-
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
90+
return std::make_unique<SYCLStream<T>>(args...);
9191

9292
#elif defined(OMP)
9393
// Use the OpenMP implementation
94-
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
94+
return std::make_unique<OMPStream<T>>(args...);
9595

9696
#elif defined(FUTHARK)
9797
// Use the Futhark implementation
98-
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
98+
return std::make_unique<FutharkStream<T>>(args...);
9999

100100
#else
101101

Diff for: src/acc/ACCStream.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
#include "ACCStream.h"
99

1010
template <class T>
11-
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
12-
: array_size{ARRAY_SIZE}
11+
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
12+
T initA, T initB, T initC)
13+
: array_size{array_size}
1314
{
1415
acc_device_t device_type = acc_get_device_type();
15-
acc_set_device_num(device, device_type);
16+
acc_set_device_num(device_id, device_type);
1617

1718
// Set up data region on device
1819
this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
2526

2627
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
2728
{}
29+
30+
init_arrays(initA, initB, initC);
2831
}
2932

3033
template <class T>
@@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
6265
}
6366

6467
template <class T>
65-
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
68+
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
6669
{
6770
T *a = this->a;
6871
T *b = this->b;
6972
T *c = this->c;
7073
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
7174
{}
7275

73-
for (intptr_t i = 0; i < array_size; i++)
74-
{
75-
h_a[i] = a[i];
76-
h_b[i] = b[i];
77-
h_c[i] = c[i];
78-
}
76+
h_a = a;
77+
h_b = b;
78+
h_c = c;
7979
}
8080

8181
template <class T>

Diff for: src/acc/ACCStream.h

+13-20
Original file line numberDiff line numberDiff line change
@@ -19,32 +19,25 @@
1919
template <class T>
2020
class ACCStream : public Stream<T>
2121
{
22-
struct A{
23-
T *a;
24-
T *b;
25-
T *c;
26-
};
27-
28-
protected:
2922
// Size of arrays
3023
intptr_t array_size;
31-
A aa;
3224
// Device side pointers
33-
T *a;
34-
T *b;
35-
T *c;
25+
T* restrict a;
26+
T* restrict b;
27+
T* restrict c;
3628

3729
public:
38-
ACCStream(const intptr_t, int);
30+
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
31+
T initA, T initB, T initC);
3932
~ACCStream();
4033

41-
virtual void copy() override;
42-
virtual void add() override;
43-
virtual void mul() override;
44-
virtual void triad() override;
45-
virtual void nstream() override;
46-
virtual T dot() override;
34+
void copy() override;
35+
void add() override;
36+
void mul() override;
37+
void triad() override;
38+
void nstream() override;
39+
T dot() override;
4740

48-
virtual void init_arrays(T initA, T initB, T initC) override;
49-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
41+
void get_arrays(T const*& a, T const*& b, T const*& c) override;
42+
void init_arrays(T initA, T initB, T initC);
5043
};

Diff for: src/benchmark.h

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <array>
5+
#include <initializer_list>
6+
#include <iostream>
7+
8+
// Array values
9+
#define startA (0.1)
10+
#define startB (0.2)
11+
#define startC (0.0)
12+
#define startScalar (0.4)
13+
14+
// Benchmark Identifier: identifies individual & groups of benchmarks:
15+
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
16+
// - All: all kernels.
17+
// - Individual kernels only.
18+
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
19+
20+
struct Benchmark {
21+
BenchId id;
22+
char const* label;
23+
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
24+
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
25+
size_t weight;
26+
// Is it one of: Copy, Mul, Add, Triad, Dot?
27+
bool classic = false;
28+
};
29+
30+
// Benchmarks in the order in which - if present - should be run for validation purposes:
31+
constexpr size_t num_benchmarks = 6;
32+
constexpr std::array<Benchmark, num_benchmarks> bench = {
33+
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
34+
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
35+
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
36+
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
37+
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
38+
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
39+
};
40+
41+
// Which buffers are needed by each benchmark
42+
inline bool needs_buffer(BenchId id, char n) {
43+
auto in = [n](std::initializer_list<char> values) {
44+
return std::find(values.begin(), values.end(), n) != values.end();
45+
};
46+
switch(id) {
47+
case BenchId::All: return in({'a','b','c'});
48+
case BenchId::Classic: return in({'a','b','c'});
49+
case BenchId::Copy: return in({'a','c'});
50+
case BenchId::Mul: return in({'b','c'});
51+
case BenchId::Add: return in({'a','b','c'});
52+
case BenchId::Triad: return in({'a','b','c'});
53+
case BenchId::Dot: return in({'a','b'});
54+
case BenchId::Nstream: return in({'a','b','c'});
55+
default:
56+
std::cerr << "Unknown benchmark" << std::endl;
57+
abort();
58+
}
59+
}
60+
61+
// Returns true if the benchmark needs to be run:
62+
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
63+
if (selection == BenchId::All) return true;
64+
if (selection == BenchId::Classic && b.classic) return true;
65+
return selection == b.id;
66+
}

Diff for: src/cuda/CUDAStream.cu

+26-14
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ void free_host(T* p) {
7777
}
7878

7979
template <class T>
80-
CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
80+
CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index,
81+
T initA, T initB, T initC)
8182
: array_size(array_size)
8283
{
8384
// Set device
@@ -131,14 +132,20 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
131132
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
132133

133134
// Check buffers fit on the device
134-
if (dprop.totalGlobalMem < total_bytes)
135+
if (dprop.totalGlobalMem < total_bytes) {
136+
std::cerr << "Requested array size of " << total_bytes * 1e-9
137+
<< " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl;
135138
throw std::runtime_error("Device does not have enough memory for all buffers");
139+
}
136140

137141
// Allocate buffers:
138142
d_a = alloc_device<T>(array_size);
139143
d_b = alloc_device<T>(array_size);
140144
d_c = alloc_device<T>(array_size);
141145
sums = alloc_host<T>(dot_num_blocks);
146+
147+
// Initialize buffers:
148+
init_arrays(initA, initB, initC);
142149
}
143150

144151
template <class T>
@@ -204,21 +211,26 @@ void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
204211
}
205212

206213
template <class T>
207-
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
214+
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
208215
{
209-
// Copy device memory to host
210-
#if defined(PAGEFAULT) || defined(MANAGED)
211216
CU(cudaStreamSynchronize(stream));
212-
for (intptr_t i = 0; i < array_size; ++i)
213-
{
214-
a[i] = d_a[i];
215-
b[i] = d_b[i];
216-
c[i] = d_c[i];
217-
}
217+
#if defined(PAGEFAULT) || defined(MANAGED)
218+
// Unified memory: return pointers to device memory
219+
a = d_a;
220+
b = d_b;
221+
c = d_c;
218222
#else
219-
CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost));
220-
CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost));
221-
CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost));
223+
// No Unified memory: copy data to the host
224+
size_t nbytes = array_size * sizeof(T);
225+
h_a.resize(array_size);
226+
h_b.resize(array_size);
227+
h_c.resize(array_size);
228+
a = h_a.data();
229+
b = h_b.data();
230+
c = h_c.data();
231+
CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
232+
CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
233+
CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
222234
#endif
223235
}
224236

0 commit comments

Comments
 (0)