From 461a13ffc6c0c0bc77b2ddf475fe19c101e2fd15 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 21 Sep 2024 21:09:13 +0800 Subject: [PATCH 1/9] Fix default value of primary-reboot-down-after-period in sentinel.conf (#1040) Since in here the monitor value is mymaster, we need to make sure the primary name is the same, otherwise the default configuration cannot start sentinel. ``` sentinel monitor mymaster 127.0.0.1 6379 2 ``` The following error occurs when the default configuration is started: ``` *** FATAL CONFIG FILE ERROR (Version 255.255.255) *** Reading the configuration file, at line 358 >>> 'SENTINEL primary-reboot-down-after-period myprimary 0' No such master with specified name. ``` Introduced in #647. Signed-off-by: Binbin --- sentinel.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentinel.conf b/sentinel.conf index dcdd7d3d01..4c98a090ab 100644 --- a/sentinel.conf +++ b/sentinel.conf @@ -355,4 +355,4 @@ SENTINEL announce-hostnames no # accept a -LOADING response after a primary has been rebooted, before failing # over. -SENTINEL primary-reboot-down-after-period myprimary 0 +SENTINEL primary-reboot-down-after-period mymaster 0 From 3e83653afd466915bcbcc25e9690beedfed8c9d9 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Wed, 18 Sep 2024 14:35:35 +0300 Subject: [PATCH 2/9] Fix memory allocation for server databases (#1046) Fix a bug in the way we allocate memory for the server databases Introduced in #156. Signed-off-by: Ran Shidlansik --- src/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index 8970c43724..6a2d46d974 100644 --- a/src/server.c +++ b/src/server.c @@ -2636,7 +2636,7 @@ void initServer(void) { serverLog(LL_WARNING, "Failed creating the event loop. Error message: '%s'", strerror(errno)); exit(1); } - server.db = zmalloc(sizeof(server) * server.dbnum); + server.db = zmalloc(sizeof(serverDb) * server.dbnum); /* Create the databases, and initialize other internal state. */ int slot_count_bits = 0; From 3e3b955f8f965e9e60e533ff5ccb31d58da6d966 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 22 Sep 2024 20:20:55 +0800 Subject: [PATCH 3/9] Use _Thread_local to solve threads.h build issue (#1053) Apparently this will fail to compile in some masOS version. And internet claims _Thread_local is portable. Fixes #1051. Signed-off-by: Binbin --- src/zmalloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/zmalloc.c b/src/zmalloc.c index 7b19107b66..7f9d7f6888 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -88,11 +88,7 @@ void zlibc_free(void *ptr) { #define dallocx(ptr, flags) je_dallocx(ptr, flags) #endif -#if __STDC_NO_THREADS__ -#define thread_local __thread -#else -#include -#endif +#define thread_local _Thread_local #define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1) /* A thread-local storage which keep the current thread's index in the used_memory_thread array. */ From b6744f2b1edeb48413b78239a5c7768db6352c9c Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 25 Sep 2024 14:50:48 +0800 Subject: [PATCH 4/9] Fix module / script call CLUSTER SLOTS / SHARDS fake client check crash (#1063) The reason is VM_Call will use a fake client without connection, so we also need to check if c->conn is NULL. This also affects scripts. If they are called in the script, the server will crash. Injecting commands into AOF will also cause startup failure. Fixes #1054. Signed-off-by: Binbin --- src/aof.c | 1 + src/eval.c | 1 + src/functions.c | 1 + src/module.c | 5 +++- src/networking.c | 7 +++-- src/server.h | 40 ++++++++++++------------- tests/integration/aof.tcl | 23 ++++++++++++++ tests/modules/Makefile | 3 +- tests/modules/cluster.c | 51 ++++++++++++++++++++++++++++++++ tests/unit/cluster/scripting.tcl | 8 +++++ tests/unit/moduleapi/cluster.tcl | 24 +++++++++++++-- 11 files changed, 138 insertions(+), 26 deletions(-) create mode 100644 tests/modules/cluster.c diff --git a/src/aof.c b/src/aof.c index f48c8bd1bc..e712295127 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1371,6 +1371,7 @@ struct client *createAOFClient(void) { */ c->raw_flag = 0; c->flag.deny_blocking = 1; + c->flag.fake = 1; /* We set the fake client as a replica waiting for the synchronization * so that the server will not try to send replies to this client. */ diff --git a/src/eval.c b/src/eval.c index 580c35bdcc..3b8390af78 100644 --- a/src/eval.c +++ b/src/eval.c @@ -260,6 +260,7 @@ void scriptingInit(int setup) { if (lctx.lua_client == NULL) { lctx.lua_client = createClient(NULL); lctx.lua_client->flag.script = 1; + lctx.lua_client->flag.fake = 1; /* We do not want to allow blocking commands inside Lua */ lctx.lua_client->flag.deny_blocking = 1; diff --git a/src/functions.c b/src/functions.c index 38d0634927..985d8b3e1a 100644 --- a/src/functions.c +++ b/src/functions.c @@ -408,6 +408,7 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { client *c = createClient(NULL); c->flag.deny_blocking = 1; c->flag.script = 1; + c->flag.fake = 1; engineInfo *ei = zmalloc(sizeof(*ei)); *ei = (engineInfo){ .name = engine_name_sds, diff --git a/src/module.c b/src/module.c index 24cc6a42e7..938ae6ba70 100644 --- a/src/module.c +++ b/src/module.c @@ -656,6 +656,7 @@ client *moduleAllocTempClient(void) { } else { c = createClient(NULL); c->flag.module = 1; + c->flag.fake = 1; c->user = NULL; /* Root user */ } return c; @@ -894,8 +895,10 @@ void moduleCreateContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, int ctx out_ctx->flags = ctx_flags; if (ctx_flags & VALKEYMODULE_CTX_TEMP_CLIENT) out_ctx->client = moduleAllocTempClient(); - else if (ctx_flags & VALKEYMODULE_CTX_NEW_CLIENT) + else if (ctx_flags & VALKEYMODULE_CTX_NEW_CLIENT) { out_ctx->client = createClient(NULL); + out_ctx->client->flag.fake = 1; + } /* Calculate the initial yield time for long blocked contexts. * in loading we depend on the server hz, but in other cases we also wait diff --git a/src/networking.c b/src/networking.c index 44a94087c9..eac1022696 100644 --- a/src/networking.c +++ b/src/networking.c @@ -346,6 +346,7 @@ client *createCachedResponseClient(int resp) { /* Allocating the `conn` allows to prepare the caching client before adding * data to the clients output buffer by `prepareClientToWrite`. */ recording_client->conn = zcalloc(sizeof(connection)); + recording_client->flag.fake = 1; return recording_client; } @@ -3246,8 +3247,10 @@ char *getClientSockname(client *c) { int isClientConnIpV6(client *c) { /* The cached client peer id is on the form "[IPv6]:port" for IPv6 * addresses, so we just check for '[' here. */ - if (c->conn->type == NULL && server.current_client) { - /* Fake client? Use current client instead. */ + if (c->flag.fake && server.current_client) { + /* Fake client? Use current client instead. + * Noted that in here we are assuming server.current_client is set + * and real (aof has already violated this in loadSingleAppendOnlyFil). */ c = server.current_client; } return getClientPeerId(c)[0] == '['; diff --git a/src/server.h b/src/server.h index a5cee03055..a7cf8635a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1223,26 +1223,26 @@ typedef struct ClientFlags { uint64_t reprocessing_command : 1; /* The client is re-processing the command. */ uint64_t replication_done : 1; /* Indicate that replication has been done on the client */ uint64_t authenticated : 1; /* Indicate a client has successfully authenticated */ - uint64_t - protected_rdb_channel : 1; /* Dual channel replication sync: Protects the RDB client from premature \ - * release during full sync. This flag is used to ensure that the RDB client, which \ - * references the first replication data block required by the replica, is not \ - * released prematurely. Protecting the client is crucial for prevention of \ - * synchronization failures: \ - * If the RDB client is released before the replica initiates PSYNC, the primary \ - * will reduce the reference count (o->refcount) of the block needed by the replica. - * \ - * This could potentially lead to the removal of the required data block, resulting \ - * in synchronization failures. Such failures could occur even in scenarios where \ - * the replica only needs an additional 4KB beyond the minimum size of the - * repl_backlog. - * By using this flag, we ensure that the RDB client remains intact until the replica - * \ has successfully initiated PSYNC. */ - uint64_t repl_rdb_channel : 1; /* Dual channel replication sync: track a connection which is used for rdb snapshot */ - uint64_t dont_cache_primary : 1; /* In some cases we don't want to cache the primary. For example, the replica - * knows that it does not need the cache and required a full sync. With this - * flag, we won't cache the primary in freeClient. */ - uint64_t reserved : 6; /* Reserved for future use */ + uint64_t protected_rdb_channel : 1; /* Dual channel replication sync: Protects the RDB client from premature \ + * release during full sync. This flag is used to ensure that the RDB client, which \ + * references the first replication data block required by the replica, is not \ + * released prematurely. Protecting the client is crucial for prevention of \ + * synchronization failures: \ + * If the RDB client is released before the replica initiates PSYNC, the primary \ + * will reduce the reference count (o->refcount) of the block needed by the replica. + * \ + * This could potentially lead to the removal of the required data block, resulting \ + * in synchronization failures. Such failures could occur even in scenarios where \ + * the replica only needs an additional 4KB beyond the minimum size of the + * repl_backlog. + * By using this flag, we ensure that the RDB client remains intact until the replica + * \ has successfully initiated PSYNC. */ + uint64_t repl_rdb_channel : 1; /* Dual channel replication sync: track a connection which is used for rdb snapshot */ + uint64_t dont_cache_primary : 1; /* In some cases we don't want to cache the primary. For example, the replica + * knows that it does not need the cache and required a full sync. With this + * flag, we won't cache the primary in freeClient. */ + uint64_t fake : 1; /* This is a fake client without a real connection. */ + uint64_t reserved : 5; /* Reserved for future use */ } ClientFlags; typedef struct client { diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 72fae9915b..33c7c12d4b 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -673,3 +673,26 @@ tags {"aof external:skip"} { } } } + +# make sure the test infra won't use SELECT +set old_singledb $::singledb +set ::singledb 1 + +tags {"aof cluster external:skip"} { + test {Test cluster slots / cluster shards in aof won't crash} { + create_aof $aof_dirpath $aof_file { + append_to_aof [formatCommand cluster slots] + append_to_aof [formatCommand cluster shards] + } + + create_aof_manifest $aof_dirpath $aof_manifest_file { + append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n" + } + + start_server_aof [list dir $server_path cluster-enabled yes] { + assert_equal [r ping] {PONG} + } + } +} + +set ::singledb $old_singledb diff --git a/tests/modules/Makefile b/tests/modules/Makefile index 9966b8840e..1690b9b627 100644 --- a/tests/modules/Makefile +++ b/tests/modules/Makefile @@ -63,7 +63,8 @@ TEST_MODULES = \ postnotifications.so \ moduleauthtwo.so \ rdbloadsave.so \ - crash.so + crash.so \ + cluster.so .PHONY: all diff --git a/tests/modules/cluster.c b/tests/modules/cluster.c new file mode 100644 index 0000000000..b3b53d5d93 --- /dev/null +++ b/tests/modules/cluster.c @@ -0,0 +1,51 @@ +#include "valkeymodule.h" + +#define UNUSED(x) (void)(x) + +int test_cluster_slots(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + UNUSED(argv); + + if (argc != 1) return ValkeyModule_WrongArity(ctx); + + ValkeyModuleCallReply *rep = ValkeyModule_Call(ctx, "CLUSTER", "c", "SLOTS"); + if (!rep) { + ValkeyModule_ReplyWithError(ctx, "ERR NULL reply returned"); + } else { + ValkeyModule_ReplyWithCallReply(ctx, rep); + ValkeyModule_FreeCallReply(rep); + } + + return VALKEYMODULE_OK; +} + +int test_cluster_shards(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + UNUSED(argv); + + if (argc != 1) return ValkeyModule_WrongArity(ctx); + + ValkeyModuleCallReply *rep = ValkeyModule_Call(ctx, "CLUSTER", "c", "SHARDS"); + if (!rep) { + ValkeyModule_ReplyWithError(ctx, "ERR NULL reply returned"); + } else { + ValkeyModule_ReplyWithCallReply(ctx, rep); + ValkeyModule_FreeCallReply(rep); + } + + return VALKEYMODULE_OK; +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "cluster", 1, VALKEYMODULE_APIVER_1)== VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + if (ValkeyModule_CreateCommand(ctx, "test.cluster_slots", test_cluster_slots, "", 0, 0, 0) == VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + if (ValkeyModule_CreateCommand(ctx, "test.cluster_shards", test_cluster_shards, "", 0, 0, 0) == VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + return VALKEYMODULE_OK; +} diff --git a/tests/unit/cluster/scripting.tcl b/tests/unit/cluster/scripting.tcl index 1cf1421079..88e158afc5 100644 --- a/tests/unit/cluster/scripting.tcl +++ b/tests/unit/cluster/scripting.tcl @@ -88,4 +88,12 @@ start_cluster 1 0 {tags {external:skip cluster}} { assert_match {*Can not run script on cluster, 'no-cluster' flag is set*} $e } + + test "Calling cluster slots in scripts is OK" { + assert_equal [lsort [r 0 cluster slots]] [lsort [r 0 eval "return redis.call('cluster', 'slots')" 0]] + } + + test "Calling cluster shards in scripts is OK" { + assert_equal [lsort [r 0 cluster shards]] [lsort [r 0 eval "return redis.call('cluster', 'shards')" 0]] + } } diff --git a/tests/unit/moduleapi/cluster.tcl b/tests/unit/moduleapi/cluster.tcl index 5570f980f2..5e4244d684 100644 --- a/tests/unit/moduleapi/cluster.tcl +++ b/tests/unit/moduleapi/cluster.tcl @@ -219,8 +219,6 @@ start_cluster 2 2 [list config_lines $modules] { } } -} - set testmodule [file normalize tests/modules/basics.so] set modules [list loadmodule $testmodule] start_cluster 3 0 [list config_lines $modules] { @@ -234,3 +232,25 @@ start_cluster 3 0 [list config_lines $modules] { assert_equal {PONG} [$node3 PING] } } + +set testmodule [file normalize tests/modules/cluster.so] +set modules [list loadmodule $testmodule] +start_cluster 3 0 [list config_lines $modules] { + set node1 [srv 0 client] + set node2 [srv -1 client] + set node3 [srv -2 client] + + test "VM_CALL with cluster slots" { + assert_equal [lsort [$node1 cluster slots]] [lsort [$node1 test.cluster_slots]] + assert_equal [lsort [$node2 cluster slots]] [lsort [$node2 test.cluster_slots]] + assert_equal [lsort [$node3 cluster slots]] [lsort [$node3 test.cluster_slots]] + } + + test "VM_CALL with cluster shards" { + assert_equal [lsort [$node1 cluster shards]] [lsort [$node1 test.cluster_shards]] + assert_equal [lsort [$node2 cluster shards]] [lsort [$node2 test.cluster_shards]] + assert_equal [lsort [$node3 cluster shards]] [lsort [$node3 test.cluster_shards]] + } +} + +} ;# end tag From 20d438dcc1538fb0e70763b559048d1759258ef0 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 23 Sep 2024 13:14:09 +0800 Subject: [PATCH 5/9] Print an empty primary log when primary lost its last slot (#1064) The one in CLUSTER SETSLOT help us keep track of state better, of course it also can make the test case happy. The one in gossip process fixes a problem that a replica can print a log saying it is an empty primary. Signed-off-by: Binbin Co-authored-by: Ping Xie --- src/cluster_legacy.c | 15 +++++++++++++-- tests/unit/cluster/replica-migration.tcl | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 469530adcc..643058ca39 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2670,7 +2670,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * If the sender and myself are in the same shard, try psync. */ clusterSetPrimary(sender, !are_in_same_shard, !are_in_same_shard); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); - } else if ((sender_slots >= migrated_our_slots) && !are_in_same_shard) { + } else if (nodeIsPrimary(myself) && (sender_slots >= migrated_our_slots) && !are_in_same_shard) { /* When all our slots are lost to the sender and the sender belongs to * a different shard, this is likely due to a client triggered slot * migration. Don't reconfigure this node to migrate to the new shard @@ -6419,11 +6419,12 @@ void clusterCommandSetSlot(client *c) { int slot_was_mine = server.cluster->slots[slot] == my_primary; clusterDelSlot(slot); clusterAddSlot(n, slot); + int shard_is_empty = my_primary->numslots == 0; /* If replica migration is allowed, check if the primary of this shard * loses its last slot and the shard becomes empty. In this case, we * should turn into a replica of the new primary. */ - if (server.cluster_allow_replica_migration && slot_was_mine && my_primary->numslots == 0) { + if (server.cluster_allow_replica_migration && slot_was_mine && shard_is_empty) { serverAssert(n != my_primary); serverLog(LL_NOTICE, "Lost my last slot during slot migration. Reconfiguring myself " @@ -6439,6 +6440,16 @@ void clusterCommandSetSlot(client *c) { clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); } + /* If replica migration is not allowed, check if the primary of this shard + * loses its last slot and the shard becomes empty. In this case, we will + * print the exact same log as during the gossip process. */ + if (!server.cluster_allow_replica_migration && nodeIsPrimary(myself) && slot_was_mine && shard_is_empty) { + serverAssert(n != my_primary); + serverLog(LL_NOTICE, + "My last slot was migrated to node %.40s (%s) in shard %.40s. I am now an empty primary.", + n->name, n->human_nodename, n->shard_id); + } + /* If this node or this node's primary was importing this slot, * assigning the slot to itself also clears the importing status. */ if ((n == myself || n == myself->replicaof) && server.cluster->importing_slots_from[slot]) { diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl index b40bfcec22..05d6528684 100644 --- a/tests/unit/cluster/replica-migration.tcl +++ b/tests/unit/cluster/replica-migration.tcl @@ -372,7 +372,7 @@ proc test_cluster_setslot {type} { fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result" } - # Wait for R 3 to report that it is an empty replica (cluster-allow-replica-migration no) + # Wait for R 3 to report that it is an empty primary (cluster-allow-replica-migration no) wait_for_log_messages -3 {"*I am now an empty primary*"} 0 1000 50 if {$type == "setslot"} { From 20f5a661f7cd581224e5b5c0e3d45da90baef5f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Wed, 25 Sep 2024 09:55:53 +0200 Subject: [PATCH 6/9] Fix bug for CLUSTER SLOTS from EVAL over TLS (#1072) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For fake clients like the ones used for Lua and modules, we don't determine TLS in the right way, causing CLUSTER SLOTS from EVAL over TLS to fail a debug-assert. This error was introduced when the caching of CLUSTER SLOTS was introduced, i.e. in 8.0.0. Signed-off-by: Viktor Söderqvist --- src/cluster.c | 2 +- tests/unit/cluster/cluster-response-tls.tcl | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 9b9e1f04eb..0b93821a53 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1428,7 +1428,7 @@ void clusterCommandSlots(client *c) { * ... continued until done */ int conn_type = 0; - if (connIsTLS(c->conn)) conn_type |= CACHE_CONN_TYPE_TLS; + if (shouldReturnTlsInfo()) conn_type |= CACHE_CONN_TYPE_TLS; if (isClientConnIpV6(c)) conn_type |= CACHE_CONN_TYPE_IPv6; if (c->resp == 3) conn_type |= CACHE_CONN_TYPE_RESP3; diff --git a/tests/unit/cluster/cluster-response-tls.tcl b/tests/unit/cluster/cluster-response-tls.tcl index 9b4e165e99..20670b748c 100644 --- a/tests/unit/cluster/cluster-response-tls.tcl +++ b/tests/unit/cluster/cluster-response-tls.tcl @@ -24,6 +24,15 @@ proc get_port_from_node_info {line} { proc cluster_response_tls {tls_cluster} { + test "CLUSTER SLOTS cached using EVAL over TLS -- tls-cluster $tls_cluster" { + set client_tcp [valkey 127.0.0.1 [srv 0 pport] 0 0] + set client_tls [valkey 127.0.0.1 [srv 0 port] 0 1] + set slots1 [$client_tls EVAL {return server.call('CLUSTER', 'SLOTS')} 0] + set slots2 [$client_tcp CLUSTER SLOTS] + # Compare the ports in the first row + assert_no_match [lindex $slots1 0 2 1] [lindex $slots2 0 2 1] + } + test "CLUSTER SLOTS with different connection type -- tls-cluster $tls_cluster" { set slots1 [R 0 cluster slots] set pport [srv 0 pport] From 8c19df99b52b0bc5d865b8b6cd1f6e2ebcb96c95 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 25 Sep 2024 17:30:45 +0800 Subject: [PATCH 7/9] Fix RDMA build dependence (#1074) RDMA module has dependence on '$(SERVER_NAME)' rather than the old style '$(REDIS_SERVER_NAME)'. Signed-off-by: zhenwei pi --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 13fa1c0275..020b70d6d5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -513,7 +513,7 @@ $(TLS_MODULE_NAME): $(SERVER_NAME) $(QUIET_CC)$(CC) -o $@ tls.c -shared -fPIC $(TLS_MODULE_CFLAGS) $(TLS_CLIENT_LIBS) # valkey-rdma.so -$(RDMA_MODULE_NAME): $(REDIS_SERVER_NAME) +$(RDMA_MODULE_NAME): $(SERVER_NAME) $(QUIET_CC)$(CC) -o $@ rdma.c -shared -fPIC $(RDMA_MODULE_CFLAGS) # valkey-cli From e0824e3996401c64127338009290db75b035693a Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 30 Sep 2024 16:42:08 -0700 Subject: [PATCH 8/9] Initial staging for 8.0.1 security release Signed-off-by: Madelyn Olson --- 00-RELEASENOTES | 17 +++++++++++++++++ src/server.h | 41 +++++++++++++++++++++-------------------- src/version.h | 4 ++-- 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/00-RELEASENOTES b/00-RELEASENOTES index df45398bd0..e71d67f50f 100644 --- a/00-RELEASENOTES +++ b/00-RELEASENOTES @@ -9,6 +9,23 @@ CRITICAL: There is a critical bug affecting MOST USERS. Upgrade ASAP. SECURITY: There are security fixes in the release. -------------------------------------------------------------------------------- +================================================================================ +Valkey 8.0.1 - Released Tue 1 Oct 2024 +================================================================================ + +Upgrade urgency SECURITY: This release includes security fixes we recommend you +apply as soon as possible. + +Bug fixes +========= +* Fix a build issue with RDMA when using additional make parameters. (#1074) +* Fix an issue where `CLUSTER SLOTS` might return the wrong tcp or tls port when called + from inside a script or from a module. (#1072) +* Fix a crash when `CLUSTER SLOTS` or `CLUSTER SHARDS` is called from inside + a script or from a module. (#1063) +* Fix a build issue on systems where `` is unavailable. (#1053) +* Fix an issue with the default `sentinel.conf` being invalid. (#1040) + ================================================================================ Valkey 8.0.0 GA - Released Sun 15 Sep 2024 ================================================================================ diff --git a/src/server.h b/src/server.h index a7cf8635a0..36668ae31e 100644 --- a/src/server.h +++ b/src/server.h @@ -1223,26 +1223,27 @@ typedef struct ClientFlags { uint64_t reprocessing_command : 1; /* The client is re-processing the command. */ uint64_t replication_done : 1; /* Indicate that replication has been done on the client */ uint64_t authenticated : 1; /* Indicate a client has successfully authenticated */ - uint64_t protected_rdb_channel : 1; /* Dual channel replication sync: Protects the RDB client from premature \ - * release during full sync. This flag is used to ensure that the RDB client, which \ - * references the first replication data block required by the replica, is not \ - * released prematurely. Protecting the client is crucial for prevention of \ - * synchronization failures: \ - * If the RDB client is released before the replica initiates PSYNC, the primary \ - * will reduce the reference count (o->refcount) of the block needed by the replica. - * \ - * This could potentially lead to the removal of the required data block, resulting \ - * in synchronization failures. Such failures could occur even in scenarios where \ - * the replica only needs an additional 4KB beyond the minimum size of the - * repl_backlog. - * By using this flag, we ensure that the RDB client remains intact until the replica - * \ has successfully initiated PSYNC. */ - uint64_t repl_rdb_channel : 1; /* Dual channel replication sync: track a connection which is used for rdb snapshot */ - uint64_t dont_cache_primary : 1; /* In some cases we don't want to cache the primary. For example, the replica - * knows that it does not need the cache and required a full sync. With this - * flag, we won't cache the primary in freeClient. */ - uint64_t fake : 1; /* This is a fake client without a real connection. */ - uint64_t reserved : 5; /* Reserved for future use */ + uint64_t + protected_rdb_channel : 1; /* Dual channel replication sync: Protects the RDB client from premature \ + * release during full sync. This flag is used to ensure that the RDB client, which \ + * references the first replication data block required by the replica, is not \ + * released prematurely. Protecting the client is crucial for prevention of \ + * synchronization failures: \ + * If the RDB client is released before the replica initiates PSYNC, the primary \ + * will reduce the reference count (o->refcount) of the block needed by the replica. + * \ + * This could potentially lead to the removal of the required data block, resulting \ + * in synchronization failures. Such failures could occur even in scenarios where \ + * the replica only needs an additional 4KB beyond the minimum size of the + * repl_backlog. + * By using this flag, we ensure that the RDB client remains intact until the replica + * \ has successfully initiated PSYNC. */ + uint64_t repl_rdb_channel : 1; /* Dual channel replication sync: track a connection which is used for rdb snapshot */ + uint64_t dont_cache_primary : 1; /* In some cases we don't want to cache the primary. For example, the replica + * knows that it does not need the cache and required a full sync. With this + * flag, we won't cache the primary in freeClient. */ + uint64_t fake : 1; /* This is a fake client without a real connection. */ + uint64_t reserved : 5; /* Reserved for future use */ } ClientFlags; typedef struct client { diff --git a/src/version.h b/src/version.h index 3005df6b81..eccb47465a 100644 --- a/src/version.h +++ b/src/version.h @@ -4,8 +4,8 @@ * similar. */ #define SERVER_NAME "valkey" #define SERVER_TITLE "Valkey" -#define VALKEY_VERSION "8.0.0" -#define VALKEY_VERSION_NUM 0x00080000 +#define VALKEY_VERSION "8.0.1" +#define VALKEY_VERSION_NUM 0x00080001 /* Redis OSS compatibility version, should never * exceed 7.2.x. */ From db9e1ad1373e4fafc109c972b534b11ddd481a49 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 20 Sep 2024 14:25:05 +0800 Subject: [PATCH 9/9] Fix timing issue in the new tot-net-out replica test (#1060) Apparently there is a timing issue when using wait_for_ofs_sync: ``` [exception]: Executing test client: can't read "out_before": no such variable. can't read "out_before": no such variable ``` The reason is that if the connection between the primary and the replica is not established yet, the master_repl_offset of the primary and replica in wait_for_ofs_sync is 0, and the check fails, resulting in no replica client in the client list below. In this case, we need to make sure the replica is online before proceeding. Signed-off-by: Binbin --- tests/unit/introspection.tcl | 60 +++++++++++++++++------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index d7253f3750..286b02b7d0 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -25,7 +25,7 @@ start_server {tags {"introspection"}} { set kv [split $item "="] set k [lindex $kv 0] if {[string match $field $k]} { - return [lindex $kv 1] + return [lindex $kv 1] } } return "" @@ -332,7 +332,7 @@ start_server {tags {"introspection"}} { r migrate [srv 0 host] [srv 0 port] key 9 5000 AUTH2 user password catch {r auth not-real} _ catch {r auth not-real not-a-password} _ - + assert_match {*"key"*"9"*"5000"*} [$rd read] assert_match {*"key"*"9"*"5000"*"(redacted)"*} [$rd read] assert_match {*"key"*"9"*"5000"*"(redacted)"*"(redacted)"*} [$rd read] @@ -375,46 +375,45 @@ start_server {tags {"introspection"}} { $rd close } - + test {MONITOR log blocked command only once} { - # need to reconnect in order to reset the clients state reconnect - + set rd [valkey_deferring_client] set bc [valkey_deferring_client] r del mylist - + $rd monitor $rd read ; # Discard the OK - + $bc blpop mylist 0 wait_for_blocked_clients_count 1 r lpush mylist 1 wait_for_blocked_clients_count 0 r lpush mylist 2 - + # we expect to see the blpop on the monitor first assert_match {*"blpop"*"mylist"*"0"*} [$rd read] - + # we scan out all the info commands on the monitor set monitor_output [$rd read] while { [string match {*"info"*} $monitor_output] } { set monitor_output [$rd read] } - + # we expect to locate the lpush right when the client was unblocked assert_match {*"lpush"*"mylist"*"1"*} $monitor_output - + # we scan out all the info commands set monitor_output [$rd read] while { [string match {*"info"*} $monitor_output] } { set monitor_output [$rd read] } - + # we expect to see the next lpush and not duplicate blpop command assert_match {*"lpush"*"mylist"*"2"*} $monitor_output - + $rd close $bc close } @@ -655,7 +654,7 @@ start_server {tags {"introspection"}} { assert_equal [r config get save] {save {}} } } {} {external:skip} - + test {CONFIG SET with multiple args} { set some_configs {maxmemory 10000001 repl-backlog-size 10000002 save {3000 5}} @@ -677,7 +676,7 @@ start_server {tags {"introspection"}} { test {CONFIG SET rollback on set error} { # This test passes an invalid percent value to maxmemory-clients which should cause an - # input verification failure during the "set" phase before trying to apply the + # input verification failure during the "set" phase before trying to apply the # configuration. We want to make sure the correct failure happens and everything # is rolled back. # backup maxmemory config @@ -700,7 +699,7 @@ start_server {tags {"introspection"}} { test {CONFIG SET rollback on apply error} { # This test tries to configure a used port number in the server. This is expected - # to pass the `CONFIG SET` validity checking implementation but fail on + # to pass the `CONFIG SET` validity checking implementation but fail on # actual "apply" of the setting. This will validate that after an "apply" # failure we rollback to the previous values. proc dummy_accept {chan addr port} {} @@ -733,7 +732,7 @@ start_server {tags {"introspection"}} { dict set some_configs port $used_port # Run a dummy server on used_port so we know we can't configure the server to - # use it. It's ok for this to fail because that means used_port is invalid + # use it. It's ok for this to fail because that means used_port is invalid # anyway catch {socket -server dummy_accept -myaddr 127.0.0.1 $used_port} e if {$::verbose} { puts "dummy_accept: $e" } @@ -779,18 +778,18 @@ start_server {tags {"introspection"}} { test {CONFIG GET multiple args} { set res [r config get maxmemory maxmemory* bind *of] - + # Verify there are no duplicates in the result assert_equal [expr [llength [dict keys $res]]*2] [llength $res] - + # Verify we got both name and alias in result - assert {[dict exists $res slaveof] && [dict exists $res replicaof]} + assert {[dict exists $res slaveof] && [dict exists $res replicaof]} # Verify pattern found multiple maxmemory* configs - assert {[dict exists $res maxmemory] && [dict exists $res maxmemory-samples] && [dict exists $res maxmemory-clients]} + assert {[dict exists $res maxmemory] && [dict exists $res maxmemory-samples] && [dict exists $res maxmemory-clients]} # Verify we also got the explicit config - assert {[dict exists $res bind]} + assert {[dict exists $res bind]} } test {valkey-server command line arguments - error cases} { @@ -845,22 +844,19 @@ start_server {tags {"introspection"}} { set primary_pid [srv -1 pid] set replica [srv 0 client] set replica_pid [srv 0 pid] - + $replica replicaof $primary_host $primary_port - + # Wait for replica to be connected before proceeding. - wait_for_ofs_sync $primary $replica - + wait_replica_online $primary + # Avoid PINGs to make sure tot-net-out is stable. $primary config set repl-ping-replica-period 3600 # Increase repl timeout to avoid replica disconnecting $primary config set repl-timeout 3600 $replica config set repl-timeout 3600 - - # Wait for the replica to receive the command. - wait_for_ofs_sync $primary $replica - + # Get the tot-net-out of the replica before sending the command. set info_list [$primary client list] foreach info [split $info_list "\r\n"] { @@ -869,11 +865,11 @@ start_server {tags {"introspection"}} { break } } - + # Send a command to the primary. set value_size 10000 $primary set foo [string repeat "a" $value_size] - + # Get the tot-net-out of the replica after sending the command. set info_list [$primary client list] foreach info [split $info_list "\r\n"] {