Skip to content

Commit 29887f2

Browse files
author
tziegler
committed
Publish
1 parent 54bc712 commit 29887f2

File tree

180 files changed

+25851
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

180 files changed

+25851
-0
lines changed

.clang-format

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
BasedOnStyle: Chromium
3+
# BreakBeforeBraces: Linux  (disabled: duplicate key; the later "BreakBeforeBraces: Custom" entry is the one kept)
4+
SpaceInEmptyParentheses: 'false'
5+
ColumnLimit: 140
6+
IndentWidth: 3
7+
AllowShortBlocksOnASingleLine: true
8+
AllowShortIfStatementsOnASingleLine: true
9+
BreakBeforeBraces: Custom
10+
...

.gitignore

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
.idea/
2+
bin/
3+
build*/
4+
cmake-*/
5+
.DS_Store
6+
a.out
7+
data/
8+
log/
9+
adhoc/
10+
#release/
11+
#debug/
12+
log.txt
13+
git-ignore
14+
*#
15+
debug*
16+
release*
17+
clang*
18+
paper*
19+
.dir-locals.el
20+
compile_commands.json
21+
CMakeFiles/

CMakeLists.txt

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# ---------------------------------------------------------------------------
2+
# NAM DB
3+
# ---------------------------------------------------------------------------
4+
5+
# cmake_minimum_required() has to run before project() so that policy defaults are in place when the project is configured.
cmake_minimum_required(VERSION 3.7)
6+
project(namdb)
7+
8+
# ---------------------------------------------------------------------------
9+
# Environment
10+
# ---------------------------------------------------------------------------
11+
# Make the project-local cmake/ directory searchable by include() and find_package().
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/")
12+
# Request C++17 for all targets and treat it as a hard requirement.
set(CMAKE_CXX_STANDARD 17)
13+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
14+
# Always add pthread support and debug symbols, independent of the build type.
string(APPEND CMAKE_CXX_FLAGS " -pthread -g")
15+
16+
# Architecture-specific compiler flags, applied to every target defined after this point.
# x86 / amd64: warnings are errors; AVX2 (-mavx2) and CMPXCHG16B (-mcx16) code generation is enabled.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
17+
add_compile_options(-Wall -Wextra -Werror -fnon-call-exceptions -fasynchronous-unwind-tables -mavx2 -mcx16 -m64) # -fno-elide-constructors no need for now
18+
# Any other processor: warnings stay non-fatal and the code is tuned for the build machine (-march=native).
else()
19+
add_compile_options(-Wall -Wextra -march=native)
20+
endif()
21+
22+
# Debug builds define the DEBUG macro; release builds are optimized with -O3.
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -DDEBUG")
23+
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
24+
# Write compile_commands.json for tooling that consumes a compilation database.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
25+
26+
# Report the selected build type at configure time; anything other than "Debug" is reported as release mode.
# NOTE(review): Red, BoldGreen and ColourReset are not defined in this file — presumably set elsewhere, otherwise they expand to nothing.
if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
27+
message("${BoldGreen}Release mode: ${ColourReset} " ${CMAKE_BUILD_TYPE})
28+
else()
29+
message("${Red}!ATTENTION: debug mode enabled! ${ColourReset}")
30+
endif()
31+
32+
33+
# THREADS_PREFER_PTHREAD_FLAG is read by the FindThreads module, so it must be set before find_package(Threads) to take effect.
set(THREADS_PREFER_PTHREAD_FLAG ON)
34+
find_package(Threads REQUIRED)
35+
36+
# ---------------------------------------------------------------------------
37+
# Includes
38+
# ---------------------------------------------------------------------------
39+
include("${CMAKE_SOURCE_DIR}/shared-headers/local.cmake")
40+
include("${CMAKE_SOURCE_DIR}/libs/gflags.cmake")
41+
# ---------------------------------------------------------------------------
42+
# Sources
43+
# ---------------------------------------------------------------------------
44+
45+
add_subdirectory("backend")
46+
47+
# ---------------------------------------------------------------------------
48+
# Executable
49+
# ---------------------------------------------------------------------------
50+
51+
add_subdirectory("frontend")

README.org

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
* Design Guidelines for Correct, Efficient, and Scalable Synchronization using One-Sided RDMA
2+
3+
This repository contains the source code for our SIGMOD 2023 paper of the same name.
4+
5+
** Paper Abstract
6+
Remote data structures built with one-sided Remote Direct Memory Access (RDMA) are at the heart of many disaggregated database management systems today. Concurrent access to these data structures by thousands of remote workers necessitates a highly efficient synchronization scheme. Remarkably, our investigation reveals that existing synchronization schemes display substantial variations in performance and scalability. Even worse, some schemes do not correctly synchronize, resulting in rare and hard-to-detect data corruption. Motivated by these observations, we conduct the first comprehensive analysis of one-sided synchronization techniques and provide general principles for correct synchronization using one-sided RDMA. Our research demonstrates that adherence to these principles not only guarantees correctness but also results in substantial performance enhancements.
7+
8+
** Citation
9+
10+
11+
#+begin_src
12+
@article{10.1145/3589276,
13+
author = {Ziegler, Tobias and Nelson-Slivon, Jacob and Leis, Viktor and Binnig, Carsten},
14+
title = {Design Guidelines for Correct, Efficient, and Scalable Synchronization Using One-Sided RDMA},
15+
year = {2023},
16+
url = {https://doi.org/10.1145/3589276},
17+
doi = {10.1145/3589276},
18+
journal = {Proc. ACM Manag. Data},
19+
}
20+
#+end_src
21+
22+
** Benchmarks
23+
The benchmarks and lock implementations can be found in =frontend=.
24+
The experiment scripts can be found in =distexperiments/experiments=.
25+
26+
** Setup
27+
28+
*** Cluster Setup
29+
All experiments were conducted on a 5-node cluster running Ubuntu 18.04.1 LTS with the Linux 4.15.0 kernel.
30+
Each node is equipped with two Intel(R) Xeon(R) Gold 5120 CPUs (14 cores), 512 GB of main memory split between both sockets, and four Samsung
31+
SSD 980 Pro M.2 1 TB connected via PCIe by one ASRock Hyper Quad M.2 PCIe card.
32+
The nodes of the cluster are connected with an InfiniBand network using one Mellanox ConnectX-5 MT27800 NIC (InfiniBand EDR 4x, 100 Gbps) per node.
33+
34+
*** Mellanox RDMA
35+
We used the following Mellanox OFED installation:
36+
37+
**** ofed_info
38+
#+begin_src shell
39+
MLNX_OFED_LINUX-5.1-2.5.8.0 (OFED-5.1-2.5.8):
40+
Installed Packages:
41+
-------------------
42+
ii ar-mgr 1.0-0.3.MLNX20200824.g8577618.51258 amd64 Adaptive Routing Manager
43+
ii dapl2-utils 2.1.10.1.mlnx-OFED.51258 amd64 Utilities for use with the DAPL libraries
44+
ii dpcp 1.1.0-1.51258 amd64 Direct Packet Control Plane (DPCP) is a library to use Devx
45+
ii dump-pr 1.0-0.3.MLNX20200824.g8577618.51258 amd64 Dump PathRecord Plugin
46+
ii hcoll 4.6.3125-1.51258 amd64 Hierarchical collectives (HCOLL)
47+
ii ibacm 51mlnx1-1.51258 amd64 InfiniBand Communication Manager Assistant (ACM)
48+
ii ibdump 6.0.0-1.51258 amd64 Mellanox packets sniffer tool
49+
ii ibsim 0.9-1.51258 amd64 InfiniBand fabric simulator for management
50+
ii ibsim-doc 0.9-1.51258 all documentation for ibsim
51+
ii ibutils2 2.1.1-0.126.MLNX20200721.gf95236b.51258 amd64 OpenIB Mellanox InfiniBand Diagnostic Tools
52+
ii ibverbs-providers:amd64 51mlnx1-1.51258 amd64 User space provider drivers for libibverbs
53+
ii ibverbs-utils 51mlnx1-1.51258 amd64 Examples for the libibverbs library
54+
ii infiniband-diags 51mlnx1-1.51258 amd64 InfiniBand diagnostic programs
55+
ii iser-dkms 5.1-OFED.5.1.2.5.3.1 all DKMS support fo iser kernel modules
56+
ii isert-dkms 5.1-OFED.5.1.2.5.3.1 all DKMS support fo isert kernel modules
57+
ii kernel-mft-dkms 4.15.1-100 all DKMS support for kernel-mft kernel modules
58+
ii knem 1.1.4.90mlnx1-OFED.5.1.2.5.0.1 amd64 userspace tools for the KNEM kernel module
59+
ii knem-dkms 1.1.4.90mlnx1-OFED.5.1.2.5.0.1 all DKMS support for mlnx-ofed kernel modules
60+
ii libdapl-dev 2.1.10.1.mlnx-OFED.51258 amd64 Development files for the DAPL libraries
61+
ii libdapl2 2.1.10.1.mlnx-OFED.51258 amd64 The Direct Access Programming Library (DAPL)
62+
ii libibmad-dev:amd64 51mlnx1-1.51258 amd64 Development files for libibmad
63+
ii libibmad5:amd64 51mlnx1-1.51258 amd64 Infiniband Management Datagram (MAD) library
64+
ii libibnetdisc5:amd64 51mlnx1-1.51258 amd64 InfiniBand diagnostics library
65+
ii libibumad-dev:amd64 51mlnx1-1.51258 amd64 Development files for libibumad
66+
ii libibumad3:amd64 51mlnx1-1.51258 amd64 InfiniBand Userspace Management Datagram (uMAD) library
67+
ii libibverbs-dev:amd64 51mlnx1-1.51258 amd64 Development files for the libibverbs library
68+
ii libibverbs1:amd64 51mlnx1-1.51258 amd64 Library for direct userspace use of RDMA (InfiniBand/iWARP)
69+
ii libibverbs1-dbg:amd64 51mlnx1-1.51258 amd64 Debug symbols for the libibverbs library
70+
ii libopensm 5.7.3.MLNX20201102.e56fd90-0.1.51258 amd64 Infiniband subnet manager libraries
71+
ii libopensm-devel 5.7.3.MLNX20201102.e56fd90-0.1.51258 amd64 Developement files for OpenSM
72+
ii librdmacm-dev:amd64 51mlnx1-1.51258 amd64 Development files for the librdmacm library
73+
ii librdmacm1:amd64 51mlnx1-1.51258 amd64 Library for managing RDMA connections
74+
ii mlnx-ethtool 5.4-1.51258 amd64 This utility allows querying and changing settings such as speed,
75+
ii mlnx-iproute2 5.6.0-1.51258 amd64 This utility allows querying and changing settings such as speed,
76+
ii mlnx-ofed-kernel-dkms 5.1-OFED.5.1.2.5.8.1 all DKMS support for mlnx-ofed kernel modules
77+
ii mlnx-ofed-kernel-utils 5.1-OFED.5.1.2.5.8.1 amd64 Userspace tools to restart and tune mlnx-ofed kernel modules
78+
ii mpitests 3.2.20-5d20b49.51258 amd64 Set of popular MPI benchmarks and tools IMB 2018 OSU benchmarks ver 4.0.1 mpiP-3.3 IPM-2.0.6
79+
ii mstflint 4.14.0-3.51258 amd64 Mellanox firmware burning application
80+
ii openmpi 4.0.4rc3-1.51258 all Open MPI
81+
ii opensm 5.7.3.MLNX20201102.e56fd90-0.1.51258 amd64 An Infiniband subnet manager
82+
ii opensm-doc 5.7.3.MLNX20201102.e56fd90-0.1.51258 amd64 Documentation for opensm
83+
ii perftest 4.4+0.5-1 amd64 Infiniband verbs performance tests
84+
ii rdma-core 51mlnx1-1.51258 amd64 RDMA core userspace infrastructure and documentation
85+
ii rdmacm-utils 51mlnx1-1.51258 amd64 Examples for the librdmacm library
86+
ii sharp 2.2.2.MLNX20201102.b26a0fd-1.51258 amd64 SHArP switch collectives
87+
ii srp-dkms 5.1-OFED.5.1.2.5.3.1 all DKMS support fo srp kernel modules
88+
ii srptools 51mlnx1-1.51258 amd64 Tools for Infiniband attached storage (SRP)
89+
ii ucx 1.9.0-1.51258 amd64 Unified Communication X
90+
#+end_src
91+
92+
93+
*** Libraries
94+
- gflags
95+
- lib_aio
96+
- ibverbs
97+
- tabulate
98+
- rdma cm
99+

backend/.#CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

backend/CMakeLists.txt

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# ---------------------------------------------------------------------------
2+
# NAM-DB
3+
# ---------------------------------------------------------------------------
4+
project(nam)
5+
6+
# ---------------------------------------------------------------------------
7+
# Files
8+
# ---------------------------------------------------------------------------
9+
# Collect the .cpp and .hpp files found recursively below this directory into NAM_CC (the sources of the nam library).
file(GLOB_RECURSE NAM_CC **.cpp **/**.cpp **.hpp **/**.hpp)
10+
# Only UNIX-like platforms are supported; report a configure error on anything else.
if (NOT UNIX)
11+
message(SEND_ERROR "unsupported platform")
12+
endif ()
13+
14+
# ---------------------------------------------------------------------------
15+
# Library
16+
# ---------------------------------------------------------------------------
17+
add_library(nam STATIC ${NAM_CC})
18+
19+
# Optional AddressSanitizer instrumentation: enabled with -DSANI=ON and only honoured in Debug builds.
option(SANI "Compile nam with sanitizers" OFF)
20+
if (SANI)
21+
if (CMAKE_BUILD_TYPE MATCHES Debug)
22+
message("Compiling with Sanitizers")
23+
target_compile_options(nam PUBLIC -fsanitize=address)
24+
# Pass the sanitizer flag to the linker instead of naming an "asan" library, so the compiler driver selects the matching runtime.
target_link_libraries(nam -fsanitize=address)
25+
endif ()
26+
endif ()
27+
28+
# Libraries the nam target links against (keyword-less signature).
target_link_libraries(nam gflags Threads::Threads atomic numa rdmacm ibverbs aio)
29+
# NOTE(review): SHARED_INCLUDE_DIRECTORY is not set in this file — presumably defined by shared-headers/local.cmake; confirm.
target_include_directories(nam PUBLIC ${SHARED_INCLUDE_DIRECTORY})
30+
target_include_directories(nam PRIVATE ${CMAKE_CURRENT_LIST_DIR})
31+
# ---------------------------------------------------------------------------
32+
# Record this directory in NAM_INCLUDE_DIR and append it to the include directories seen by targets that link nam.
set(NAM_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR})
33+
set_property(TARGET nam APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${NAM_INCLUDE_DIR})

backend/nam/Compute.cpp

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#include "Compute.hpp"
2+
// -------------------------------------------------------------------------------------
3+
#include <fcntl.h>
4+
#include <linux/fs.h>
5+
#include <stdio.h>
6+
#include <sys/ioctl.h>
7+
#include <termios.h>
8+
#include <unistd.h>
9+
10+
namespace nam {
11+
// Creates the communication manager, the RDMA counters and the worker pool, in that order.
Compute::Compute() {
12+
cm = std::make_unique<rdma::CM<rdma::InitMessage>>();
13+
rdmaCounters = std::make_unique<profiling::RDMACounters>();
14+
// The worker pool is constructed from the communication manager, so cm has to exist first.
// NOTE(review): the meaning of the second constructor argument (0) is not visible here — confirm.
workerPool = std::make_unique<threads::WorkerPool>(*cm, 0);
15+
}
16+
17+
// Stops any still-running profiler threads, then tears down the worker pool.
// Destroying a joinable std::thread calls std::terminate, so threads launched by
// startProfiler() are joined here even if the caller never invoked stopProfiler().
// Note: stopProfiler() also resets the global locale to "C".
Compute::~Compute() {
18+
stopProfiler();
workerPool.reset();
19+
}
20+
} // namespace nam

backend/nam/Compute.hpp

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#pragma once
2+
// -------------------------------------------------------------------------------------
3+
#include "profiling/ProfilingThread.hpp"
4+
#include "profiling/counters/RDMACounters.hpp"
5+
#include "rdma/CommunicationManager.hpp"
6+
#include "threads/CoreManager.hpp"
7+
#include "threads/WorkerPool.hpp"
8+
#include "nam/utils/RandomGenerator.hpp"
9+
// -------------------------------------------------------------------------------------
10+
#include <memory>
11+
12+
namespace nam
13+
{
14+
// -------------------------------------------------------------------------------------
15+
class Compute
16+
{
17+
18+
public:
19+
//! Default constructor
20+
Compute();
21+
//! Destructor
22+
~Compute();
23+
// -------------------------------------------------------------------------------------
24+
// Deleted constructors
25+
//! Copy constructor
26+
Compute(const Compute& other) = delete;
27+
//! Move constructor
28+
Compute(Compute&& other) noexcept = delete;
29+
//! Copy assignment operator
30+
Compute& operator=(const Compute& other) = delete;
31+
//! Move assignment operator
32+
Compute& operator=(Compute&& other) noexcept = delete;
33+
// -------------------------------------------------------------------------------------
34+
rdma::CM<rdma::InitMessage>& getCM() { return *cm; }
35+
// -------------------------------------------------------------------------------------
36+
threads::WorkerPool& getWorkerPool(){
37+
return *workerPool;
38+
}
39+
// -------------------------------------------------------------------------------------
40+
//! Marks the profiler as running and launches a thread that executes ProfilingThread::profile on pt.
//! wlInfo is handed to the thread by reference (std::ref), so it must outlive the profiling thread.
void startProfiler(profiling::WorkloadInfo& wlInfo) {
41+
pt.running = true;
42+
// NOTE(review): the meaning of the literal 0 passed to profile() is not visible here — confirm.
profilingThread.emplace_back(&profiling::ProfilingThread::profile, &pt, 0, std::ref(wlInfo));
43+
}
44+
// -------------------------------------------------------------------------------------
45+
//! Stops the profiler: clears the running flag, then joins and discards all profiling threads.
//! The global locale is reset to "C" on every call, whether or not a profiler was running.
void stopProfiler()
46+
{
47+
if (pt.running) {
48+
pt.running = false;
49+
for (auto& p : profilingThread)
50+
p.join();
51+
profilingThread.clear();
52+
}
53+
std::locale::global(std::locale("C")); // hack to restore locale which is messed up in tabulate package
54+
}
55+
56+
private:
57+
std::unique_ptr<rdma::CM<rdma::InitMessage>> cm;
58+
std::unique_ptr<profiling::RDMACounters> rdmaCounters;
59+
profiling::ProfilingThread pt;
60+
std::vector<std::thread> profilingThread;
61+
std::unique_ptr<threads::WorkerPool> workerPool;
62+
};
63+
// -------------------------------------------------------------------------------------
64+
} // namespace nam

backend/nam/Config.cpp

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// -------------------------------------------------------------------------------------
2+
#include "gflags/gflags.h"
3+
// -------------------------------------------------------------------------------------
4+
DEFINE_double(dramGB, 1, "DRAM buffer pool size");
5+
DEFINE_uint64(worker,1, "Number worker threads");
6+
DEFINE_uint64(all_worker,1, "number of all worker threads in the cluster for barrier");
7+
DEFINE_uint64(batchSize, 100, "batch size in free lists");
8+
DEFINE_uint64(pageProviderThreads, 2, " Page Provider threads must be power two");
9+
DEFINE_double(freePercentage, 1, "Percentage free for PP");
10+
DEFINE_uint64(coolingPercentage, 10 , "Percentage cooling for PP");
11+
DEFINE_double(evictCoolestEpochs, 0.1, "Percentage of coolest epochs chosen for eviction");
12+
DEFINE_bool(csv, true , "If written to csv file or not");
13+
DEFINE_string(csvFile, "stats.csv" , "filename for profiling output");
14+
DEFINE_string(tag,"","description of experiment");
15+
DEFINE_uint32(partitionBits, 6, "bits per partition");
16+
DEFINE_uint32(page_pool_partitions, 8, "page pool partitions each is shifted by 512 byte to increase cache associativity");
17+
// -------------------------------------------------------------------------------------
18+
DEFINE_bool(backoff, true, "backoff enabled");
19+
// -------------------------------------------------------------------------------------
20+
DEFINE_bool(storage_node, false, "storage node");
21+
DEFINE_uint64(storage_nodes, 1,"Number nodes participating");
22+
DEFINE_double(rdmaMemoryFactor, 1.1, "Factor to be multiplied by dramGB"); // factor to be multiplied by dramGB
23+
DEFINE_uint32(port, 7174, "port");
24+
DEFINE_string(ownIp, "172.18.94.80", "own IP server");
25+
// -------------------------------------------------------------------------------------
26+
DEFINE_uint64(pollingInterval, 16, " Number of unsignaled messages before a signaled (power of 2)");
27+
DEFINE_bool(read, true, "read protocol");
28+
DEFINE_bool(random, false, "use random pages");
29+
DEFINE_uint64(messageHandlerThreads, 4, " number message handler ");
30+
DEFINE_uint64(messageHandlerMaxRetries, 10, "Number retries before message gets restarted at client"); // prevents deadlocks but also mitigates early aborts
31+
// -------------------------------------------------------------------------------------
32+
DEFINE_uint32(sockets, 2 , "Number Sockets");
33+
DEFINE_uint32(socket, 0, " Socket we are running on");
34+
DEFINE_bool(pinThreads, true, " Pin threads");
35+
DEFINE_bool(cpuCounters,true, " CPU counters profiling ");

0 commit comments

Comments
 (0)