spcl · mcopik · Jun 4, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 11, 2024
diff --git a/benchmarks/bots/nqueens/dispatcher.cpp b/benchmarks/bots/nqueens/dispatcher.cpp
@@ -3,11 +3,11 @@
 #include <numeric>
 #include <span>
 #include <thread>
-#include <tuple>
-#include <vector>
 
 #include "./dispatcher.hpp"
 
+#include "../../include/measurement.hpp"
+
 #include <argparse/argparse.hpp>
 #include <cereal/types/vector.hpp>
 #include <cppless/dispatcher/aws-lambda.hpp>
@@ -19,10 +19,12 @@
 
 using dispatcher = cppless::aws_lambda_nghttp2_dispatcher<>::from_env;
 namespace lambda = cppless::aws;
+constexpr unsigned int timeout = 30;
 constexpr unsigned int memory_limit = 2048;
 constexpr unsigned int ephemeral_storage = 64;
 using cpu_intensive =
-    lambda::config<lambda::with_memory<memory_limit>,
+    lambda::config<lambda::with_timeout<timeout>,
+                   lambda::with_memory<memory_limit>,
                    lambda::with_ephemeral_storage<ephemeral_storage>>;
 
 auto nqueens(dispatcher_args args) -> unsigned long
@@ -32,31 +34,87 @@ auto nqueens(dispatcher_args args) -> unsigned long
 
   dispatcher aws;
   auto instance = aws.create_instance();
+  unsigned long res = 0;  // defined return value even when no repetition runs
+  const int workers = args.threads > 0 ? args.threads : 1;  // never divide by zero below
+  serverless_measurements benchmarker;
+  for(int rep = 0; rep < args.repetitions; ++rep) {
+    // Prefix generation sits inside the loop so every repetition records its own "prep" time.
+    benchmarker.start_repetition(rep);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    std::vector<unsigned char> prefixes;
+    prefixes.reserve(pow(size, prefix_length));
+    std::vector<unsigned char> scratchpad(size);
+
+    nqueens_prefixes(0,
+                     prefix_length,
+                     0,
+                     size,
+                     std::span<unsigned char> {scratchpad},
+                     prefixes);
+    // Split the prefixes into contiguous chunks; the first work_leftover workers get one extra.
+    int total_items = prefixes.size() / prefix_length;
+    int work_size = total_items / workers;
+    int work_leftover = total_items % workers;
+    std::vector<int> indices;
+    int idx = 0;
+    indices.emplace_back(0);
+    for (int t = 0; t < workers; t++) {
+      int new_idx = idx + (t < work_leftover ? work_size + 1 : work_size) * prefix_length;
+      indices.emplace_back(new_idx);
+      idx = new_idx;
+    }
+    // indices holds workers + 1 offsets: chunk t covers [indices[t], indices[t + 1]).
+
+    // One result slot per worker, passed to the matching dispatch call below.
+    std::vector<unsigned long> results(workers);
+
+    auto dispatch_start = std::chrono::high_resolution_clock::now();
+    for (int t = 0; t < workers; t++) {
+      // Iterator arithmetic instead of &prefixes[chunk_end]: chunk_end may equal prefixes.size().
+      const int chunk_begin = indices[t], chunk_end = indices[t + 1];
+      std::vector<unsigned char> prefix(
+          prefixes.begin() + chunk_begin,
+          prefixes.begin() + chunk_end);
+
+      auto task = [prefix_length, size](std::vector<unsigned char> prefix)
+      {
+        unsigned long res = 0;
+        for (unsigned int i = 0; i < prefix.size(); i += prefix_length) {
+          std::vector<unsigned char> subprefix(prefix.begin() + i,
+                                            prefix.begin() + i + prefix_length);
+          res += nqueens_serial_prefix(size, subprefix);
+        }
+        return res;
+      };
+
+      auto start_func = std::chrono::high_resolution_clock::now();
+      auto id = cppless::dispatch<cpu_intensive>(
+        instance, task, results[t], {prefix}
+      );
+
+      benchmarker.add_function_start(id, start_func);
+    }
+    auto dispatch_end = std::chrono::high_resolution_clock::now();
+
+    for (int i = 0; i < workers; i++) {
+      auto f = instance.wait_one();
+      benchmarker.add_function_result(f);
+    }
+    // 0UL keeps the accumulator unsigned long; a plain 0 would sum in int.
+    res = std::accumulate(results.begin(), results.end(), 0UL);
+    auto end = std::chrono::high_resolution_clock::now();
+
+    std::clog << "prefixes: " << prefixes.size() / prefix_length << " result: " << res << std::endl;
+
+    benchmarker.add_result(start, end, "total");
+    benchmarker.add_result(dispatch_start, dispatch_end, "dispatch");
+    benchmarker.add_result(dispatch_end, end, "wait");
+    benchmarker.add_result(start, dispatch_start, "prep");
 
-  std::vector<unsigned char> prefixes;
-  prefixes.reserve(pow(size, prefix_length));
-  std::vector<unsigned char> scratchpad(size);
-
-  nqueens_prefixes(0,
-                   prefix_length,
-                   0,
-                   size,
-                   std::span<unsigned char> {scratchpad},
-                   prefixes);
-  std::size_t num_prefixes = prefixes.size() / prefix_length;
-  std::vector<unsigned long> results(num_prefixes);
-
-  for (unsigned int i = 0; i < num_prefixes; i++) {
-    std::vector<unsigned char> prefix(
-        &prefixes[prefix_length * i],
-        &prefixes[prefix_length * i + prefix_length]);
-
-    auto task = [size](std::vector<unsigned char> prefix)
-    { return nqueens_serial_prefix(size, prefix); };
-    cppless::dispatch<cpu_intensive>(instance, task, results[i], {prefix});
   }
-  cppless::wait(instance, num_prefixes);
-  unsigned long res = std::accumulate(results.begin(), results.end(), 0);
+
+  benchmarker.write(args.output_location);
 
   return res;
 }
diff --git a/benchmarks/bots/nqueens/dispatcher.hpp b/benchmarks/bots/nqueens/dispatcher.hpp
@@ -5,6 +5,9 @@ class dispatcher_args
 public:
   unsigned int size;
   unsigned int prefix_length;
+  int threads;
+  int repetitions;
+  std::string output_location;
 };
 
-auto nqueens(dispatcher_args args) -> unsigned long;
+auto nqueens(dispatcher_args args) -> unsigned long;
diff --git a/benchmarks/bots/nqueens/main.cpp b/benchmarks/bots/nqueens/main.cpp
@@ -34,13 +34,24 @@ __attribute((weak)) auto main(int argc, char* argv[]) -> int
       .help("Use dispatcher")
       .default_value(false)
       .implicit_value(true);
+  program.add_argument("--threads-number")
+      .help("Number of threads")
+      .default_value(1)
+      .scan<'i', int>();
   program.add_argument("--threads-prefix-length")
       .help("Prefix length value when using the dispatcher implementation")
       .default_value(2)
       .scan<'i', unsigned int>();
   program.add_argument("input_size")
       .help("display the square of a given integer")
       .scan<'i', unsigned int>();
+  program.add_argument("-r")
+      .help("number of repetitions")
+      .default_value(1)
+      .scan<'i', int>();
+  program.add_argument("-o")
+      .default_value(std::string(""))
+      .help("location to write output statistics");
 
   try {
     program.parse_args(argc, argv);
@@ -50,6 +61,9 @@ __attribute((weak)) auto main(int argc, char* argv[]) -> int
     std::exit(1);
   }
   auto size = program.get<unsigned int>("input_size");
+  int repetitions = program.get<int>("-r");
+  std::string output_location = program.get<std::string>("-o");
+  auto threads = program.get<int>("--threads-number");
 
   if (program["--serial"] == true) {
     unsigned int res = nqueens(serial_args {.size = size});
@@ -58,7 +72,13 @@ __attribute((weak)) auto main(int argc, char* argv[]) -> int
     auto prefix_length =
         program.get<unsigned int>("--dispatcher-prefix-length");
     unsigned int res =
-        nqueens(dispatcher_args {.size = size, .prefix_length = prefix_length});
+        nqueens(dispatcher_args {
+          .size = size,
+          .prefix_length = prefix_length,
+          .threads = threads,
+          .repetitions = repetitions,
+          .output_location = output_location
+        });
     std::cout << res << std::endl;
   } else if (program["--graph"] == true) {
     auto prefix_length = program.get<unsigned int>("--graph-prefix-length");
@@ -67,8 +87,13 @@ __attribute((weak)) auto main(int argc, char* argv[]) -> int
     std::cout << res << std::endl;
   } else if (program["--threads"] == true) {
     auto prefix_length = program.get<unsigned int>("--threads-prefix-length");
-    unsigned int res =
-        nqueens(threads_args {.size = size, .prefix_length = prefix_length});
+    unsigned int res = nqueens(threads_args {
+        .size = size,
+        .prefix_length = prefix_length,
+        .threads = threads,
+        .repetitions = repetitions,
+        .output_location = output_location
+    });
     std::cout << res << std::endl;
   }
 

diff --git a/benchmarks/bots/nqueens/threads.cpp b/benchmarks/bots/nqueens/threads.cpp
@@ -5,37 +5,79 @@
 
 #include "./threads.hpp"
 
+#include "../../include/measurement.hpp"
+
 #include "./common.hpp"
 
 auto nqueens(threads_args args) -> unsigned int
 {
+  measurements benchmarker;
+  const int workers = args.threads > 0 ? args.threads : 1;  // never divide by zero below
   auto size = args.size;
   auto prefix_length = args.prefix_length;
+  std::atomic<unsigned long> res{0};  // defined value even when no repetition runs
+  // Shared tally; atomic because every worker thread adds to it.
 
-  auto prefixes = std::vector<unsigned char>();
-  prefixes.reserve(pow(size, prefix_length));
-  auto scratchpad = std::vector<unsigned char>(size);
+  for(int rep = 0; rep < args.repetitions; ++rep) {
 
-  nqueens_prefixes(0,
-                   prefix_length,
-                   0,
-                   size,
-                   std::span<unsigned char> {scratchpad},
-                   prefixes);
+    res = 0;  // each repetition counts from scratch
+    benchmarker.start_repetition(rep);
 
-  std::vector<std::thread> threads;
-  std::atomic<unsigned long> res;
+    auto start = std::chrono::high_resolution_clock::now();
 
-  for (unsigned int i = 0; i < prefixes.size(); i += prefix_length) {
-    std::vector<unsigned char> prefix(prefixes.begin() + i,
-                                      prefixes.begin() + i + prefix_length);
-    threads.emplace_back([prefix, size, &res]() mutable
-                         { res += nqueens_serial_prefix(size, prefix); });
-  }
+    auto prefixes = std::vector<unsigned char>();
+    prefixes.reserve(pow(size, prefix_length));
+    auto scratchpad = std::vector<unsigned char>(size);
+
+    nqueens_prefixes(0,
+                     prefix_length,
+                     0,
+                     size,
+                     std::span<unsigned char> {scratchpad},
+                     prefixes);
+
+    std::vector<std::thread> threads;
+    // From here on: partition the prefixes into contiguous chunks and launch the workers.
+    auto dispatch_start = std::chrono::high_resolution_clock::now();
+    int total_items = prefixes.size() / prefix_length;
+    int work_size = total_items / workers;
+    int work_leftover = total_items % workers;
+    std::vector<int> indices;
+    int idx = 0;
+    indices.emplace_back(0);
+    for (int t = 0; t < workers; t++) {
+      int new_idx = idx + (t < work_leftover ? work_size + 1 : work_size) * prefix_length;
+      indices.emplace_back(new_idx);
+      idx = new_idx;
+    }
+    // indices holds workers + 1 offsets: chunk t covers [indices[t], indices[t + 1]).
 
-  for (auto& t : threads) {
-    t.join();
+    for (int t = 0; t < workers; t++) {
+      // prefixes is captured by reference; every thread is joined before it goes out of scope.
+      const int chunk_begin = indices[t], chunk_end = indices[t + 1];
+      threads.emplace_back([&prefixes, size, chunk_begin, chunk_end, prefix_length, &res]() {
+
+        for (int i = chunk_begin; i < chunk_end; i += prefix_length) {
+          std::vector<unsigned char> prefix(prefixes.begin() + i,
+                                            prefixes.begin() + i + prefix_length);
+
+          res += nqueens_serial_prefix(size, prefix);
+        }
+      });
+    }
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+
+    std::clog << "prefixes: " << prefixes.size() / prefix_length << " result: " << res << std::endl;
+
+    benchmarker.add_result(start, end, "total");
+    benchmarker.add_result(start, dispatch_start, "prep");
   }
 
+  benchmarker.write(args.output_location);
+
   return res;
-}
+}
diff --git a/benchmarks/bots/nqueens/threads.hpp b/benchmarks/bots/nqueens/threads.hpp
@@ -5,6 +5,9 @@ class threads_args
 public:
   unsigned int size;
   unsigned int prefix_length;
+  int threads;
+  int repetitions;
+  std::string output_location;
 };
 
-auto nqueens(threads_args args) -> unsigned int;
+auto nqueens(threads_args args) -> unsigned int;
diff --git a/benchmarks/custom/CMakeLists.txt b/benchmarks/custom/CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
 
 find_package(ut REQUIRED)
 
+add_subdirectory(invocations)
 add_subdirectory(serialization)
 add_subdirectory(ray)
-add_subdirectory(pi)
+add_subdirectory(pi)
diff --git a/benchmarks/custom/invocations/CMakeLists.txt b/benchmarks/custom/invocations/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.14)
+# Project name mirrors this directory (benchmarks/custom/invocations).
+project(cpplessBenchmarksCustomInvocations CXX)
+# Builds dispatcher.cpp and registers it with the project's aws_lambda_* helpers.
+add_executable("benchmark_custom_invocations" dispatcher.cpp)
+target_compile_options("benchmark_custom_invocations" PRIVATE "-ffast-math")
+target_link_libraries("benchmark_custom_invocations" PRIVATE cppless::cppless)
+target_compile_features("benchmark_custom_invocations" PRIVATE cxx_std_20)
+aws_lambda_target("benchmark_custom_invocations")
+aws_lambda_serverless_target("benchmark_custom_invocations")