From 91cf42ddf9c05c1b92a590e8a0d0214969df6347 Mon Sep 17 00:00:00 2001 From: kronwerk Date: Tue, 10 Dec 2024 10:57:35 +0300 Subject: [PATCH 001/101] added aof-max-size parameter with tests; fixes #540 Signed-off-by: kronwerk improved aof-max-size tests Signed-off-by: kronwerk --- src/aof.c | 16 +++++++--- src/config.c | 1 + src/server.c | 15 +++++++++ src/server.h | 1 + tests/unit/aof-max-size.tcl | 61 +++++++++++++++++++++++++++++++++++++ valkey.conf | 3 ++ 6 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 tests/unit/aof-max-size.tcl diff --git a/src/aof.c b/src/aof.c index 0fd3cf5c26..8af3a9928f 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1010,16 +1010,22 @@ int startAppendOnly(void) { * the first call is short, there is a end-of-space condition, so the next * is likely to fail. However apparently in modern systems this is no longer * true, and in general it looks just more resilient to retry the write. If - * there is an actual error condition we'll get it at the next try. */ -ssize_t aofWrite(int fd, const char *buf, size_t len) { - ssize_t nwritten = 0, totwritten = 0; + * there is an actual error condition we'll get it at the next try. + * We also check for aof-max-size limit here returning "no space" on exceed. */ +ssize_t aofWrite(int fd, const char *buf, size_t len, off_t aof_current_size, unsigned long long aof_max_size) { + ssize_t nwritten = 0, totwritten = 0, nonewritten = -1; + + if (aof_max_size && (unsigned long long)aof_current_size >= aof_max_size) { + errno = ENOSPC; + return nonewritten; + } while (len) { nwritten = write(fd, buf, len); if (nwritten < 0) { if (errno == EINTR) continue; - return totwritten ? totwritten : -1; + return totwritten ? totwritten : nonewritten; } len -= nwritten; @@ -1119,7 +1125,7 @@ void flushAppendOnlyFile(int force) { } latencyStartMonitor(latency); - nwritten = aofWrite(server.aof_fd, server.aof_buf, sdslen(server.aof_buf)); + nwritten = aofWrite(server.aof_fd, server.aof_buf, sdslen(server.aof_buf), server.aof_current_size, server.aof_max_size); latencyEndMonitor(latency); /* We want to capture different events for delayed writes: * when the delay happens with a pending fsync, or with a saving child diff --git a/src/config.c b/src/config.c index cc0f8d2dd8..bcfa465e1f 100644 --- a/src/config.c +++ b/src/config.c @@ -3337,6 +3337,7 @@ standardConfig static_configs[] = { /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), createULongLongConfig("cluster-link-sendbuf-limit", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.cluster_link_msg_queue_limit_bytes, 0, MEMORY_CONFIG, NULL, NULL), + createULongLongConfig("aof-max-size", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.aof_max_size, 0, INTEGER_CONFIG, NULL, NULL), /* Size_t configs */ createSizeTConfig("hash-max-listpack-entries", "hash-max-ziplist-entries", MODIFIABLE_CONFIG, 0, LONG_MAX, server.hash_max_listpack_entries, 512, INTEGER_CONFIG, NULL, NULL), diff --git a/src/server.c b/src/server.c index 1e38b5ac69..518ecad603 100644 --- a/src/server.c +++ b/src/server.c @@ -5800,10 +5800,17 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "module_fork_last_cow_size:%zu\r\n", server.stat_module_cow_bytes)); if (server.aof_enabled) { + char aof_current_size_hdsk[64]; + char aof_max_size_hdsk[64]; + bytesToHuman(aof_current_size_hdsk, sizeof(aof_current_size_hdsk), (unsigned long long)server.aof_current_size); + 
bytesToHuman(aof_max_size_hdsk, sizeof(aof_max_size_hdsk), server.aof_max_size); info = sdscatprintf( info, FMTARGS( "aof_current_size:%lld\r\n", (long long)server.aof_current_size, + "aof_current_size_human:%s\r\n", aof_current_size_hdsk, + "aof_max_size:%llu\r\n", server.aof_max_size, + "aof_max_size_human:%s\r\n", aof_max_size_hdsk, "aof_base_size:%lld\r\n", (long long)server.aof_rewrite_base_size, "aof_pending_rewrite:%d\r\n", server.aof_rewrite_scheduled, "aof_buffer_length:%zu\r\n", sdslen(server.aof_buf), @@ -7130,6 +7137,14 @@ __attribute__((weak)) int main(int argc, char **argv) { server.maxmemory); } + /* Warn the user about a suspicious aof-max-size setting. */ + if (server.aof_max_size > 0 && server.aof_max_size < 1024 * 1024) { + serverLog(LL_WARNING, + "WARNING: You specified an aof-max-size value that is less than 1MB (current value is %llu bytes). Are " + "you sure this is what you really want?", + server.aof_max_size); + } + serverSetCpuAffinity(server.server_cpulist); setOOMScoreAdj(-1); diff --git a/src/server.h b/src/server.h index 14a16593b0..3ba7a61b7d 100644 --- a/src/server.h +++ b/src/server.h @@ -1939,6 +1939,7 @@ struct valkeyServer { off_t aof_rewrite_min_size; /* the AOF file is at least N bytes. */ off_t aof_rewrite_base_size; /* AOF size on latest startup or rewrite. */ off_t aof_current_size; /* AOF current size (Including BASE + INCRs). */ + unsigned long long aof_max_size; /* Max number of disk bytes to use for AOF */ off_t aof_last_incr_size; /* The size of the latest incr AOF. */ off_t aof_last_incr_fsync_offset; /* AOF offset which is already requested to be synced to disk. * Compare with the aof_last_incr_size. */ diff --git a/tests/unit/aof-max-size.tcl b/tests/unit/aof-max-size.tcl new file mode 100644 index 0000000000..4c35220a77 --- /dev/null +++ b/tests/unit/aof-max-size.tcl @@ -0,0 +1,61 @@ +start_server {tags {"aof-max-size" "external:skip"}} { + r config set auto-aof-rewrite-percentage 0 ; # disable auto-rewrite + r config set appendonly yes ; # enable AOF + + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + test "Low aof-max-size stops writing AOF with ENOSPC" { + r set k v + r config set aof-max-size 1 + + r set k2 v2 + wait_for_log_messages 0 {"*Error writing to the AOF file: No space left on device*"} 0 100 10 + } + + test "New write attempts fail and don't increase the AOF buffer anymore" { + set info1 [r info] + set buf1 [getInfoProperty $info1 mem_aof_buffer] + set len1 [getInfoProperty $info1 aof_buffer_length] + + catch {r set somelongerkey somelongervalue} err + assert {$err eq "MISCONF Errors writing to the AOF file: No space left on device"} + assert_equal [r get somelongerkey] "" + + set info2 [r info] + set buf2 [getInfoProperty $info2 mem_aof_buffer] + set len2 [getInfoProperty $info2 aof_buffer_length] + assert_equal $buf1 $buf2 + assert_equal $len1 $len2 + } + + test "Increasing aof-max-size fixes AOF write error" { + r config set aof-max-size 1000 + wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} 0 100 10 + + assert_equal [r set k3 v3] "OK" + assert_equal [r get k3] "v3" + } + + test "Meeting aof-max-size does not prevent AOF rewrite" { + set loglines [count_log_lines 0] ; # want to check new line, not from previous test + + # start write load + set load_handle0 [start_write_load $master_host $master_port 10] + wait_for_condition 50 100 { + [r dbsize] > 0 + } else { + fail "No write load detected."
+ } + + waitForBgrewriteaof r + r bgrewriteaof + wait_for_log_messages 0 {"*Background AOF rewrite finished successfully*"} $loglines 100 10 + wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} $loglines 100 10 + + # stop write load + stop_write_load $load_handle0 + wait_load_handlers_disconnected + } +} \ No newline at end of file diff --git a/valkey.conf b/valkey.conf index e23aea39de..8ea5273045 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1653,6 +1653,9 @@ aof-use-rdb-preamble yes # the AOF format in a way that may not be compatible with existing AOF parsers. aof-timestamp-enabled no +# Maximum size for AOF files on disk in bytes. Ignored if set to 0. +aof-max-size 0 + ################################ SHUTDOWN ##################################### # Maximum time to wait for replicas when shutting down, in seconds. From b57409c7b44f31ce230af4f1020aaa46f84f4909 Mon Sep 17 00:00:00 2001 From: kronwerk Date: Wed, 11 Dec 2024 17:46:46 +0300 Subject: [PATCH 002/101] tuned tests Signed-off-by: kronwerk --- tests/unit/aof-max-size.tcl | 45 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tests/unit/aof-max-size.tcl b/tests/unit/aof-max-size.tcl index 4c35220a77..e5526d819f 100644 --- a/tests/unit/aof-max-size.tcl +++ b/tests/unit/aof-max-size.tcl @@ -1,25 +1,34 @@ -start_server {tags {"aof-max-size" "external:skip"}} { +proc setup {{size 1}} { + r set k v + r config set aof-max-size $size + r set k2 v2 +} + +proc cleanup {} { + r config set aof-max-size 0 + r flushall +} + +start_server {tags {"external:skip"}} { r config set auto-aof-rewrite-percentage 0 ; # disable auto-rewrite r config set appendonly yes ; # enable AOF - set master [srv 0 client] set master_host [srv 0 host] set master_port [srv 0 port] test "Low aof-max-size stops writing AOF with ENOSPC" { - r set k v - r config set aof-max-size 1 - - r set k2 v2 + setup wait_for_log_messages 0 {"*Error writing to the AOF file: No space left on device*"} 0 100 10 + cleanup } - test "New write attempts fail and don't increase the AOF buffer anymore" { + test "New write attempts when limited by aof-max-size fail and don't increase the AOF buffer anymore" { + setup set info1 [r info] set buf1 [getInfoProperty $info1 mem_aof_buffer] set len1 [getInfoProperty $info1 aof_buffer_length] - catch {r set somelongerkey somelongervalue} err + catch {r set somelongerkey somelongrvalue} err assert {$err eq "MISCONF Errors writing to the AOF file: No space left on device"} assert_equal [r get somelongerkey] "" @@ -28,34 +37,28 @@ start_server {tags {"aof-max-size" "external:skip"}} { set info2 [r info] set buf2 [getInfoProperty $info2 mem_aof_buffer] set len2 [getInfoProperty $info2 aof_buffer_length] assert_equal $buf1 $buf2 assert_equal $len1 $len2 + cleanup } test "Increasing aof-max-size fixes AOF write error" { + setup + set loglines [count_log_lines 0] ; # want to check new line, not from previous test r config set aof-max-size 1000 - wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} 0 100 10 + wait_for_log_messages 0 {"*AOF write error looks solved. 
The server can write again.*"} $loglines 100 10 assert_equal [r set k3 v3] "OK" assert_equal [r get k3] "v3" + cleanup } test "Meeting aof-max-size does not prevent AOF rewrite" { + setup 200 set loglines [count_log_lines 0] ; # want to check new line, not from previous test - - # start write load - set load_handle0 [start_write_load $master_host $master_port 10] - wait_for_condition 50 100 { - [r dbsize] > 0 - } else { - fail "No write load detected." - } waitForBgrewriteaof r r bgrewriteaof wait_for_log_messages 0 {"*Background AOF rewrite finished successfully*"} $loglines 100 10 wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} $loglines 100 10 - - # stop write load - stop_write_load $load_handle0 - wait_load_handlers_disconnected + cleanup } } \ No newline at end of file From 089015b50a2f6a5578f59905560f22b0dcd09eb7 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Wed, 11 Dec 2024 09:47:06 -0800 Subject: [PATCH 003/101] defrag: allow defrag to start during AOF loading (#1420) Addresses https://github.com/valkey-io/valkey/issues/1393 Changes: * During AOF loading or long running script, this allows defrag to be initiated. * The AOF defrag test was corrected to eliminate the wait period and rely on non-timer invocations. * Logic for "overage" time in defrag was changed. It previously accumulated underage leading to large latencies in extreme tests having very high CPU percentage. After several simple stages were completed during infrequent blocked processing, a large cycle time would be experienced. Signed-off-by: Jim Brunner --- src/defrag.c | 14 ++++++++++---- tests/unit/memefficiency.tcl | 6 +++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 057fdd50de..2fa067f0dc 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -84,7 +84,7 @@ struct DefragContext { long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) monotime timeproc_end_time; // Ending time of previous timerproc execution - long timeproc_overage_us; // A correction value if over/under target CPU percent + long timeproc_overage_us; // A correction value if over target CPU percent }; static struct DefragContext defrag; @@ -1157,7 +1157,7 @@ static int computeDefragCycleUs(void) { * the starvation of the timer. */ dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); - // Also adjust for any accumulated overage(underage). + // Also adjust for any accumulated overage. dutyCycleUs -= defrag.timeproc_overage_us; defrag.timeproc_overage_us = 0; @@ -1176,8 +1176,11 @@ static int computeDefragCycleUs(void) { * computeDefragCycleUs computation. */ static int computeDelayMs(monotime intendedEndtime) { defrag.timeproc_end_time = getMonotonicUs(); - int overage = defrag.timeproc_end_time - intendedEndtime; + long overage = defrag.timeproc_end_time - intendedEndtime; defrag.timeproc_overage_us += overage; // track over/under desired CPU + /* Allow negative overage (underage) to count against existing overage, but don't allow + * underage (from short stages) to be accumulated. 
*/ + if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0; int targetCpuPercent = server.active_defrag_cpu_percent; serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); @@ -1189,7 +1192,7 @@ static int computeDelayMs(monotime intendedEndtime) { long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent; long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us; // Only increase delay by the fraction of the overage that would be non-duty-cycle - delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative + delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; if (delayUs < 0) delayUs = 0; long delayMs = delayUs / 1000; // round down return delayMs; @@ -1254,6 +1257,9 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i * actions. This interface allows defrag to continue running, avoiding a single long defrag step * after the long operation completes. */ void defragWhileBlocked(void) { + // This is called infrequently, while timers are not active. We might need to start defrag. + if (!defragIsRunning()) monitorActiveDefrag(); + if (!defragIsRunning()) return; // Save off the timeproc_id. If we have a normal termination, it will be cleared. diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index abd23b1d83..ce74b7c618 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -138,8 +138,12 @@ run_solo {defrag} { # reset stats and load the AOF file r config resetstat r config set key-load-delay -25 ;# sleep on average 1/25 usec + # Note: This test is checking if defrag is working DURING AOF loading (while + # timers are not active). So we don't give any extra time, and we deactivate + # defrag immediately after the AOF loading is complete. During loading, + # defrag will get invoked less often, triggering starvation prevention. We + # should expect longer latency measurements. r debug loadaof - after 1000 ;# give defrag a chance to work before turning it off r config set activedefrag no # measure hits and misses right after aof loading From 2a2900fd3ff4b7ac11a70772a5dad11eedf75aa0 Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:26:06 -0800 Subject: [PATCH 004/101] Send MEET packet to node if there is no inbound link to fix inconsistency when handshake timed out (#1307) In some cases, when meeting a new node, if the handshake times out, we can end up with an inconsistent view of the cluster where the new node knows about all the nodes in the cluster, but the cluster does not know about this new node (or vice versa). To detect this inconsistency, we now check if a node has an outbound link but no inbound link, which probably means this node does not know us. In this case we (re-)send a MEET packet to this node to do a new handshake with it. If we receive a MEET packet from a known node, we disconnect the outbound link to force a reconnect and the sending of a PING packet so that the other node recognizes the link as belonging to us. This prevents cases where a node could send MEET packets in a loop because it thinks the other node does not have an inbound link. This fixes the bug described in #1251.
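For reference, the detection rule this patch adds to the cron can be summarized as a standalone predicate. The sketch below is illustrative only: `nodeView` is a hypothetical stand-in for the handful of `clusterNode` fields the real check consults, not a type in the tree.

```c
#include <stdbool.h>
#include <stddef.h>

typedef long long mstime_t;

/* Hypothetical stand-in for the clusterNode fields this check consults. */
typedef struct {
    void *link;                       /* outbound link, NULL if disconnected */
    void *inbound_link;               /* inbound link, NULL if none */
    mstime_t inbound_link_freed_time; /* last time the inbound link was freed */
    bool in_normal_state;             /* no HANDSHAKE/MEET/PFAIL/FAIL flags set */
} nodeView;

/* We can reach the node (outbound link up) and it is in a settled state, yet
 * it has had no inbound link to us for longer than the handshake timeout, so
 * it probably does not know us and deserves a fresh MEET. */
static bool should_resend_meet(const nodeView *n, mstime_t now, mstime_t handshake_timeout) {
    return n->link != NULL && n->inbound_link == NULL && n->in_normal_state &&
           now - n->inbound_link_freed_time > handshake_timeout;
}
```

Requiring the quiet period to exceed the handshake timeout avoids re-MEETing a node whose handshake with us may still legitimately be in flight.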
--------- Signed-off-by: Pierre Turin --- src/cluster_legacy.c | 99 ++++++--- src/cluster_legacy.h | 4 + tests/support/cluster_util.tcl | 9 + tests/unit/cluster/cluster-multiple-meets.tcl | 4 +- tests/unit/cluster/cluster-reliable-meet.tcl | 208 +++++++++++++++++- 5 files changed, 291 insertions(+), 33 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a273fe0d86..d1c6dd0094 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1336,6 +1336,10 @@ clusterLink *createClusterLink(clusterNode *node) { * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { serverAssert(link != NULL); + serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound"); + if (link->conn) { connClose(link->conn); link->conn = NULL; @@ -1351,6 +1355,7 @@ void freeClusterLink(clusterLink *link) { } else if (link->node->inbound_link == link) { serverAssert(link->inbound); link->node->inbound_link = NULL; + link->node->inbound_link_freed_time = mstime(); } } zfree(link); @@ -1490,6 +1495,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->fail_time = 0; node->link = NULL; node->inbound_link = NULL; + node->inbound_link_freed_time = node->ctime; memset(node->ip, 0, sizeof(node->ip)); node->announce_client_ipv4 = sdsempty(); node->announce_client_ipv6 = sdsempty(); @@ -1696,6 +1702,9 @@ void clusterAddNode(clusterNode *node) { * it is a replica node. */ void clusterDelNode(clusterNode *delnode) { + serverAssert(delnode != NULL); + serverLog(LL_DEBUG, "Deleting node %.40s from cluster view", delnode->name); + int j; dictIterator *di; dictEntry *de; @@ -2078,7 +2087,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { /* Return 1 if we already have a node in HANDSHAKE state matching the * specified ip address and port number. This function is used in order to * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { +static int clusterHandshakeInProgress(char *ip, int port, int cport) { dictIterator *di; dictEntry *de; @@ -2100,7 +2109,7 @@ int clusterHandshakeInProgress(char *ip, int port, int cport) { * * EAGAIN - There is already a handshake in progress for this address. * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { +static int clusterStartHandshake(char *ip, int port, int cport) { clusterNode *n; char norm_ip[NET_IP_STR_LEN]; struct sockaddr_storage sa; @@ -3207,33 +3216,48 @@ int clusterProcessPacket(clusterLink *link) { } } - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (type == CLUSTERMSG_TYPE_MEET) { + if (!sender) { + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node; + + node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + clusterProcessGossipSection(hdr, link); + } else if (sender->link && now - sender->ctime > server.cluster_node_timeout) { + /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not + * know it. + * Freeing my outbound link to that node, to force a reconnect and sending a PING. + * Once that node receives our PING, it should recognize the new connection as an inbound link from me. + * We should only free the outbound link if the node is known for more time than the handshake timeout, + * since during this time, the other side might still be trying to complete the handshake. */ + + /* We should always receive a MEET packet on an inbound link. */ + serverAssert(link != sender->link); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s after receiving a MEET packet from this known node", + sender->name); + freeClusterLink(sender->link); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) clusterProcessGossipSection(hdr, link); - /* Anyway reply with a PONG */ clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } @@ -3243,7 +3267,7 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), link->node ? link->node->name : "NULL"); - if (sender && (sender->flags & CLUSTER_NODE_MEET)) { + if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. 
*/ sender->flags &= ~CLUSTER_NODE_MEET; serverLog(LL_NOTICE, "Successfully completed handshake with %.40s (%s)", sender->name, @@ -3668,7 +3692,7 @@ void clusterLinkConnectHandler(connection *conn) { * of a PING one, to force the receiver to add us in its node * table. */ mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + clusterSendPing(link, nodeInMeetState(node) ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); if (old_ping_sent) { /* If there was an active ping before the link was * disconnected, we want to restore the ping time, otherwise @@ -3747,7 +3771,9 @@ void clusterReadHandler(connection *conn) { if (nread <= 0) { /* I/O error... */ - serverLog(LL_DEBUG, "I/O error reading from node link: %s", + serverLog(LL_DEBUG, "I/O error reading from node link (%.40s:%s): %s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound", (nread == 0) ? "connection closed" : connGetLastError(conn)); handleLinkIOError(link); return; @@ -3928,6 +3954,12 @@ void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip information. */ void clusterSendPing(clusterLink *link, int type) { + serverLog(LL_DEBUG, "Sending %s packet to node %.40s (%s) on %s link", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "", + link->node ? link->node->human_nodename : "", + link->inbound ? "inbound" : "outbound"); + static unsigned long long cluster_pings_sent = 0; cluster_pings_sent++; int gossipcount = 0; /* Number of gossip sections added so far. */ @@ -4943,6 +4975,15 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ clusterDelNode(node); return 1; } + if (node->link != NULL && node->inbound_link == NULL && nodeInNormalState(node) && + now - node->inbound_link_freed_time > handshake_timeout) { + /* Node has an outbound link, but no inbound link for more than the handshake timeout. + * This probably means this node does not know us yet, whereas we know it. + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. 
*/ + node->flags |= CLUSTER_NODE_MEET; + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s because there is no inbound link for it", node->name); + clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); + } if (node->link == NULL) { clusterLink *link = createClusterLink(node); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5595402a4d..fb317038d6 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -61,12 +61,14 @@ typedef struct clusterLink { #define nodeIsPrimary(n) ((n)->flags & CLUSTER_NODE_PRIMARY) #define nodeIsReplica(n) ((n)->flags & CLUSTER_NODE_REPLICA) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeInMeetState(n) ((n)->flags & CLUSTER_NODE_MEET) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) #define nodeSupportsExtensions(n) ((n)->flags & CLUSTER_NODE_EXTENSIONS_SUPPORTED) #define nodeSupportsLightMsgHdr(n) ((n)->flags & CLUSTER_NODE_LIGHT_HDR_SUPPORTED) +#define nodeInNormalState(n) (!((n)->flags & (CLUSTER_NODE_HANDSHAKE | CLUSTER_NODE_MEET | CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) /* This structure represent elements of node->fail_reports. */ typedef struct clusterNodeFailReport { @@ -343,6 +345,8 @@ struct _clusterNode { * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ + mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. + If it was never freed, it is the same as ctime */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ sds announce_client_ipv4; /* IPv4 for clients only. */ diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 686f00071b..4f641c5e96 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -323,6 +323,15 @@ proc get_cluster_nodes {id {status "*"}} { return $nodes } +# Returns the parsed myself node entry as a dictionary. +proc get_myself id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n myself]} {return $n} + } + return {} +} + # Returns 1 if no node knows node_id, 0 if any node knows it. 
proc node_is_forgotten {node_id} { for {set j 0} {$j < [llength $::servers]} {incr j} { diff --git a/tests/unit/cluster/cluster-multiple-meets.tcl b/tests/unit/cluster/cluster-multiple-meets.tcl index 059f03fbe4..0b5f769930 100644 --- a/tests/unit/cluster/cluster-multiple-meets.tcl +++ b/tests/unit/cluster/cluster-multiple-meets.tcl @@ -58,7 +58,7 @@ tags {tls:skip external:skip cluster} { } else { fail "Node 1 recognizes node 0 even though it drops PONGs from node 0" } - assert {[llength [get_cluster_nodes 0 connected]] == 2} + assert {[llength [get_cluster_nodes 0]] == 2} # Drop incoming and outgoing links from/to 1 R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID] @@ -77,6 +77,8 @@ tags {tls:skip external:skip cluster} { # Both a and b will turn to cluster state ok wait_for_condition 1000 50 { [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} && + [llength [get_cluster_nodes 0 connected]] == 2 && + [llength [get_cluster_nodes 1 connected]] == 2 && [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] } else { fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index 45f5a6dc89..f189e96d5b 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -3,6 +3,12 @@ set old_singledb $::singledb set ::singledb 1 tags {tls:skip external:skip cluster} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + set base_conf [list cluster-enabled yes] start_multiple_servers 2 [list overrides $base_conf] { test "Cluster nodes are reachable" { @@ -22,9 +28,6 @@ tags {tls:skip external:skip cluster} { wait_for_cluster_state fail } - set CLUSTER_PACKET_TYPE_MEET 2 - set CLUSTER_PACKET_TYPE_NONE -1 - test "Cluster nodes haven't met each other" { assert {[llength [get_cluster_nodes 1]] == 1} assert {[llength [get_cluster_nodes 0]] == 1} @@ -75,3 +78,202 @@ tags {tls:skip external:skip cluster} { set ::singledb $old_singledb +proc cluster_get_first_node_in_handshake id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n handshake]} { + return [dict get $n id] + } + } + return {} +} + +proc cluster_nodes_all_know_each_other {num_nodes} { + # Collect node IDs dynamically + set node_ids {} + for {set i 0} {$i < $num_nodes} {incr i} { + lappend node_ids [dict get [get_myself $i] id] + } + + # Check if all nodes know each other + foreach node_id $node_ids { + foreach check_node_id $node_ids { + for {set node_index 0} {$node_index < $num_nodes} {incr node_index} { + if {[cluster_get_node_by_id $node_index $check_node_id] == {}} { + return 0 + } + } + } + } + + # Verify cluster link counts for each node + set expected_links [expr {2 * ($num_nodes - 1)}] + for {set i 0} {$i < $num_nodes} {incr i} { + if {[llength [R $i CLUSTER LINKS]] != $expected_links} { + return 0 + } + } + + return 1 +} + +start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + + test "Handshake eventually succeeds after node handshake timeout on both sides with inconsistent view of the cluster" { + set cluster_port [find_available_port 
$::baseport $::portcount] start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] { # In this test we will trigger a handshake timeout on both sides of the handshake. # Node 1 and 2 already know each other, then we make node 1 meet node 0: # # Node 1 -- MEET -> Node 0 [Node 0 might learn about Node 2 from the gossip section of the msg] # Node 1 <- PONG -- Node 0 [we drop this message, so Node 1 will eventually mark the handshake as timed out] # Node 1 <- PING -- Node 0 [we drop this message, so Node 1 will never send a PONG and Node 0 will eventually mark the handshake as timed out] # # After the handshake is timed out, we allow all cluster bus messages to go through. # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake. set node0_id [dict get [get_myself 0] id] set node1_id [dict get [get_myself 1] id] set node2_id [dict get [get_myself 2] id] # Drop all cluster bus messages R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2. R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET R 1 CLUSTER MEET [srv 0 host] [srv 0 port] $cluster_port # Wait for Node 0 to be in handshake wait_for_condition 10 400 { [cluster_get_first_node_in_handshake 0] != {} } else { fail "Node 0 never entered handshake state" } # We want Node 0 to learn about Node 2 through the gossip section of the MEET message set meet_retry 0 while {[cluster_get_node_by_id 0 $node2_id] eq {}} { if {$meet_retry == 10} { error "assertion: Retried to meet Node 0 too many times" } # If Node 0 doesn't know about Node 1 & 2, it means Node 1 did not gossip about node 2 in its MEET message. # So we kill the outbound link from Node 1 to Node 0, to force a reconnect and a re-send of the MEET message. after 100 # Since we are in handshake, we use a randomly generated ID we have to find R 1 DEBUG CLUSTERLINK KILL ALL [cluster_get_first_node_in_handshake 1] incr meet_retry 1 } # Wait for Node 1's handshake to timeout wait_for_condition 50 100 { [cluster_get_first_node_in_handshake 1] eq {} } else { fail "Node 1 never exited handshake state" } # Wait for Node 0's handshake to timeout wait_for_condition 50 100 { [cluster_get_first_node_in_handshake 0] eq {} } else { fail "Node 0 never exited handshake state" } # At this point Node 0 knows Node 1 & 2 through the gossip, but they don't know Node 0. wait_for_condition 50 100 { [cluster_get_node_by_id 0 $node1_id] != {} && [cluster_get_node_by_id 0 $node2_id] != {} && [cluster_get_node_by_id 1 $node0_id] eq {} && [cluster_get_node_by_id 2 $node0_id] eq {} } else { fail "Unexpected CLUSTER NODES output, nodes 1 & 2 should not know node 0." } # Allow all messages to go through again R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link. # Handshake should now complete successfully. wait_for_condition 50 200 { [cluster_nodes_all_know_each_other 3] } else { fail "Unexpected CLUSTER NODES output, all nodes should know each other." 
+ } + } ;# stop Node 0 + } ;# test +} ;# stop cluster + +start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + + test "Handshake eventually succeeds after node handshake timeout on one side with inconsistent view of the cluster" { + set cluster_port [find_available_port $::baseport $::portcount] + start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] { + # In this test we will trigger a handshake timeout on one side of the handshake. + # Node 1 and 2 already know each other, then we make node 0 meet node 1: + # + # Node 0 -- MEET -> Node 1 + # Node 0 <- PONG -- Node 1 + # Node 0 <- PING -- Node 1 [Node 0 will mark the handshake as successful] + # Node 0 -- PONG -> Node 1 [we drop this message, so node 1 will eventually mark the handshake as timed out] + # + # After the handshake is timed out, we allow all cluster bus messages to go through. + # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake. + + set node0_id [dict get [get_myself 0] id] + set node1_id [dict get [get_myself 1] id] + set node2_id [dict get [get_myself 2] id] + + # Drop PONG messages + R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG + # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2. + R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET + + # Node 0 meets node 1 + R 0 CLUSTER MEET [srv -1 host] [srv -1 port] + + # Wait for node 0 to know about the other nodes in the cluster + wait_for_condition 50 100 { + [cluster_get_node_by_id 0 $node1_id] != {} + } else { + fail "Node 0 never learned about node 1" + } + # At this point, node 0 knows about node 1 and might know node 2 if node 1 gossiped about it. + wait_for_condition 50 100 { + [cluster_get_first_node_in_handshake 0] eq {} + } else { + fail "Node 0 never exited handshake state" + } + # At this point, from node 0's point of view, the handshake with node 1 succeeded. + + wait_for_condition 50 100 { + [cluster_get_first_node_in_handshake 1] eq {} + } else { + fail "Node 1 never exited handshake state" + } + assert {[cluster_get_node_by_id 1 $node0_id] eq {}} + # At this point, from node 1's point of view, the handshake with node 0 timed out. + + # Allow all messages + R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + + # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link. + # Handshake should now complete successfully. + wait_for_condition 50 200 { + [cluster_nodes_all_know_each_other 3] + } else { + fail "Unexpected CLUSTER NODES output, all nodes should know each other." + } + } ;# stop Node 0 + } ;# test +} ;# stop cluster From f5793d8bebbb8ccd34ad767b357168b083ff0481 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:52:58 +0200 Subject: [PATCH 005/101] Avoid defragging scripts during EVAL command execution (#1414) This can happen when a script runs for a long period of time and the server attempts to defrag it in the whileBlockedCron. 
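A minimal standalone sketch of the bail-out pattern, with stand-in declarations for the server hooks (`scriptIsRunning()` mirrors the real helper from script.h; `defragScriptsDict()` is a hypothetical wrapper for the dictionary walk):

```c
#include <stdbool.h>

typedef enum { DEFRAG_NOT_DONE, DEFRAG_DONE } doneStatus;

/* Stand-in declarations for the real server hooks (illustration only). */
extern bool scriptIsRunning(void);
extern void defragScriptsDict(void);

/* While a script is executing, defrag could relocate the body of the very
 * script being run, so the stage bails out and reports itself done; a later
 * defrag cycle will revisit the scripts dictionary when it is safe. */
doneStatus defragLuaScriptsStage(void) {
    if (scriptIsRunning()) return DEFRAG_DONE;
    defragScriptsDict();
    return DEFRAG_DONE;
}
```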
Signed-off-by: Ran Shidlansik --- src/defrag.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/defrag.c b/src/defrag.c index 2fa067f0dc..be7ff07510 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -34,6 +34,7 @@ */ #include "server.h" +#include "script.h" #include #ifdef HAVE_DEFRAG @@ -1050,6 +1051,9 @@ static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdat UNUSED(target); UNUSED(privdata); if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + /* In case we are in the process of eval some script we do not want to replace the script being run + * so we just bail out without really defragging here. */ + if (scriptIsRunning()) return DEFRAG_DONE; activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); return DEFRAG_DONE; } From e407ced83434651ee4ea3e2f8885bc457269f1af Mon Sep 17 00:00:00 2001 From: Vu Diep <54611122+vudiep411@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:42:52 -0800 Subject: [PATCH 006/101] Use `configure-aws-credentials` workflow instead of passing `secret_access_key` (#1363) ## Summary This PR fixes #1346 where we can get rid of the long term credentials by using OpenID Connect. OpenID Connect (OIDC) allows your GitHub Actions workflows to access resources in Amazon Web Services (AWS), without needing to store the AWS credentials as long-lived GitHub secrets. --------- Signed-off-by: vudiep411 --- .github/workflows/build-release-packages.yml | 43 +++++++++++++------ .../call-build-linux-arm-packages.yml | 39 ++++++++--------- .../call-build-linux-x86-packages.yml | 39 ++++++++--------- 3 files changed, 65 insertions(+), 56 deletions(-) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 094d82de08..44e012d658 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -3,7 +3,12 @@ name: Build Release Packages on: release: types: [published] - + push: + paths: + - '.github/workflows/build-release-packages.yml' + - '.github/workflows/call-build-linux-arm-packages.yml' + - '.github/workflows/call-build-linux-x86_64-packages.yml' + - 'utils/releasetools/build-config.json' workflow_dispatch: inputs: version: @@ -11,6 +16,7 @@ on: required: true permissions: + id-token: write contents: read jobs: @@ -20,8 +26,8 @@ jobs: runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} + is_test: ${{ steps.check-if-testing.outputs.IS_TEST }} steps: - - run: | echo "Version: ${{ inputs.version || github.ref_name }}" shell: bash @@ -32,8 +38,13 @@ jobs: - name: Get the version id: get_version run: | - VERSION="${INPUT_VERSION}" + if [[ "${{ github.event_name }}" == "push" ]]; then + VERSION=${{ github.ref_name }} + else + VERSION="${INPUT_VERSION}" + fi if [ -z "${VERSION}" ]; then + echo "Error: No version specified" exit 1 fi echo "VERSION=$VERSION" >> $GITHUB_OUTPUT @@ -43,6 +54,16 @@ jobs: # only ever be a tag INPUT_VERSION: ${{ inputs.version || github.ref_name }} + - name: Check if we are testing + id: check-if-testing + run: | + if [[ "${{ github.event_name }}" == "push" ]]; then + echo "IS_TEST=true" >> $GITHUB_OUTPUT + else + echo "IS_TEST=false" >> $GITHUB_OUTPUT + fi + shell: bash + generate-build-matrix: name: Generating build matrix runs-on: ubuntu-latest @@ -56,7 +77,7 @@ jobs: - uses: ./.github/actions/generate-package-build-matrix id: set-matrix with: - ref: ${{ inputs.version || github.ref_name }} + ref: ${{ needs.release-build-get-meta.outputs.version }} 
release-build-linux-x86-packages: needs: @@ -67,11 +88,10 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} release-build-linux-arm-packages: needs: @@ -82,8 +102,7 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml index 2a7bcc533f..65445a83c8 100644 --- a/.github/workflows/call-build-linux-arm-packages.yml +++ b/.github/workflows/call-build-linux-arm-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to push packages into. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The S3 bucket to push packages into. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Make Valkey uses: uraimo/run-on-arch-action@v2 with: @@ -65,15 +70,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index 9e438fa61a..a603c53c13 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. 
type: string required: true + region: + description: The AWS region to upload the packages to. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The name of the S3 bucket to upload the packages to. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. + required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Install dependencies run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev libsystemd-dev @@ -63,15 +68,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ From fa97d89f766371e05f8329d88422b7db7845e5da Mon Sep 17 00:00:00 2001 From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:46:35 -0800 Subject: [PATCH 007/101] Fix Valkey binary build workflow, version support changes. (#1429) This change makes the binary build run on the target ubuntu version. This PR also deprecated ubuntu18, and valkey will now support: - X86: - Ubuntu 20 - Ubuntu 22 - Ubuntu 24 - ARM: - Ubuntu 20 - Ubuntu 22 Removed ARM ubuntu 24 as the action we are using for ARM builds does not support Ubuntu 24. 
--------- Signed-off-by: Roshan Khatri --- .../call-build-linux-x86-packages.yml | 2 +- utils/releasetools/build-config.json | 21 +++++++------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index a603c53c13..4e68bf85f0 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -35,7 +35,7 @@ jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: "ubuntu-latest" + runs-on: ${{matrix.distro.target}} strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} diff --git a/utils/releasetools/build-config.json b/utils/releasetools/build-config.json index 5e39fae70f..f64bf601ca 100644 --- a/utils/releasetools/build-config.json +++ b/utils/releasetools/build-config.json @@ -1,29 +1,24 @@ { "linux_targets": [ + { "arch": "x86_64", - "target": "ubuntu18.04", + "target": "ubuntu-20.04", "type": "deb", - "platform": "bionic" + "platform": "focal" }, { "arch": "x86_64", - "target": "ubuntu20.04", + "target": "ubuntu-22.04", "type": "deb", - "platform": "focal" + "platform": "jammy" }, { "arch": "x86_64", - "target": "ubuntu24.04", + "target": "ubuntu-24.04", "type": "deb", "platform": "noble" }, - { - "arch": "arm64", - "target": "ubuntu18.04", - "type": "deb", - "platform": "bionic" - }, { "arch": "arm64", "target": "ubuntu20.04", @@ -32,9 +27,9 @@ }, { "arch": "arm64", - "target": "ubuntu24.04", + "target": "ubuntu22.04", "type": "deb", - "platform": "noble" + "platform": "jammy" } ] } \ No newline at end of file From efa90ff10eade0236ce1230a1fa24c1f7451c731 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 12 Dec 2024 14:55:57 -0800 Subject: [PATCH 008/101] defrag: eliminate persistent kvstore pointer and edge case fixes (#1430) This update addresses several issues in defrag: 1. In the defrag redesign (https://github.com/valkey-io/valkey/pull/1242), a bug was introduced where `server.cronloops` was no longer being incremented in the `whileBlockedCron()`. This resulted in some memory statistics not being updated while blocked. 2. In the test case for AOF loading, we were seeing errors due to defrag latencies. However, running the math, the latencies are justified given the extremely high CPU target of the testcase. Adjusted the expected latency check to allow longer latencies for this case where defrag is undergoing starvation while AOF loading is in progress. 3. A "stage" is passed a "target". For the main dictionary and expires, we were passing in a `kvstore*`. However, on flushall or swapdb, the pointer may change. It's safer and more stable to use an index for the DB (a DBID). Then if the pointer changes, we can detect the change, and simply abort the stage. (If there's still fragmentation to deal with, we'll pick it up again on the next cycle.) 4. We always start a new stage on a new defrag cycle. This gives the new stage time to run, and prevents latency issues for certain stages which don't operate incrementally. However, often several stages will require almost no work, and this will leave a chunk of our CPU allotment unused. This is mainly an issue in starvation situations (like AOF loading or LUA script) - where defrag is running infrequently, with a large duty-cycle. This change allows a new stage to be initiated if we still have a standard duty-cycle remaining. 
(This can happen during starvation situations where the planned duty cycle is larger than the standard cycle. Most likely this isn't a concern for real scenarios, but it was observed in testing.) 5. Minor comment correction in `server.h` Signed-off-by: Jim Brunner --- src/defrag.c | 67 ++++++++++++++++++++++-------------- src/server.c | 6 ++++ src/server.h | 3 +- tests/unit/memefficiency.tcl | 8 +++-- 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index be7ff07510..8c1ad29de2 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -121,7 +121,7 @@ typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdat // Private data for main dictionary keys typedef struct { kvstoreIterState kvstate; - serverDb *db; + int dbid; } defragKeysCtx; static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); @@ -736,7 +736,7 @@ static void defragModule(serverDb *db, robj *obj) { /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ static void defragKey(defragKeysCtx *ctx, robj **elemref) { - serverDb *db = ctx->db; + serverDb *db = &server.db[ctx->dbid]; int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; @@ -920,7 +920,7 @@ static doneStatus defragLaterStep(monotime endtime, void *privdata) { robj *ob = found; long long key_defragged = server.stat_active_defrag_hits; - bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->db->id) == 1); + bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->dbid) == 1); if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; } else { @@ -963,7 +963,10 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, state.cursor = 0; return DEFRAG_NOT_DONE; } - serverAssert(kvs == state.kvs); // Shouldn't change during the stage + if (kvs != state.kvs) { + // There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. + return DEFRAG_DONE; + } unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; @@ -1013,26 +1016,30 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, } -// Note: target is a DB, (not a KVS like most stages) +// Target is a DBID static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - serverDb *db = (serverDb *)target; + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; static defragKeysCtx ctx; // STATIC - this persists if (endtime == 0) { - ctx.db = db; + ctx.dbid = dbid; // Don't return yet. Call the helper with endtime==0 below. 
} - serverAssert(ctx.db == db); + serverAssert(ctx.dbid == dbid); return defragStageKvstoreHelper(endtime, db->keys, dbKeysScanCallback, defragLaterStep, &ctx); } +// Target is a DBID static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - return defragStageKvstoreHelper(endtime, (kvstore *)target, + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; + return defragStageKvstoreHelper(endtime, db->expires, scanHashtableCallbackCountScanned, NULL, NULL); } @@ -1226,29 +1233,38 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i } monotime starttime = getMonotonicUs(); - monotime endtime = starttime + computeDefragCycleUs(); + int dutyCycleUs = computeDefragCycleUs(); + monotime endtime = starttime + dutyCycleUs; + bool haveMoreWork = true; mstime_t latency; latencyStartMonitor(latency); - if (!defrag.current_stage) { - defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); - listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); - // Initialize the stage with endtime==0 - doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); - serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE - } + do { + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } - doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); - if (status == DEFRAG_DONE) { - zfree(defrag.current_stage); - defrag.current_stage = NULL; - } + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + + haveMoreWork = (defrag.current_stage || listLength(defrag.remaining_stages) > 0); + /* If we've completed a stage early, and still have a standard time allotment remaining, + * we'll start another stage. This can happen when defrag is running infrequently, and + * starvation protection has increased the duty-cycle. 
*/ + } while (haveMoreWork && getMonotonicUs() <= endtime - server.active_defrag_cycle_us); latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); - if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + if (haveMoreWork) { return computeDelayMs(endtime); } else { endDefragCycle(true); @@ -1287,9 +1303,8 @@ static void beginDefragCycle(void) { defrag.remaining_stages = listCreate(); for (int dbid = 0; dbid < server.dbnum; dbid++) { - serverDb *db = &server.db[dbid]; - addDefragStage(defragStageDbKeys, db, NULL); - addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + addDefragStage(defragStageDbKeys, (void *)(uintptr_t)dbid, NULL); + addDefragStage(defragStageExpiresKvstore, (void *)(uintptr_t)dbid, NULL); } static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; diff --git a/src/server.c b/src/server.c index 518ecad603..e495730fe2 100644 --- a/src/server.c +++ b/src/server.c @@ -1669,6 +1669,12 @@ void whileBlockedCron(void) { * latency monitor if this function is called too often. */ if (server.blocked_last_cron >= server.mstime) return; + /* Increment server.cronloops so that run_with_period works. */ + long hz_ms = 1000 / server.hz; + int cronloops = (server.mstime - server.blocked_last_cron + (hz_ms - 1)) / hz_ms; // rounding up + server.blocked_last_cron += cronloops * hz_ms; + server.cronloops += cronloops; + mstime_t latency; latencyStartMonitor(latency); diff --git a/src/server.h b/src/server.h index 3ba7a61b7d..88afb57c81 100644 --- a/src/server.h +++ b/src/server.h @@ -1900,8 +1900,7 @@ struct valkeyServer { int sanitize_dump_payload; /* Enables deep sanitization for ziplist and listpack in RDB and RESTORE. */ int skip_checksum_validation; /* Disable checksum validation for RDB and RESTORE payload. */ int jemalloc_bg_thread; /* Enable jemalloc background thread */ - int active_defrag_configuration_changed; /* defrag configuration has been changed and need to reconsider - * active_defrag_running in computeDefragCycles. */ + int active_defrag_configuration_changed; /* Config changed; need to recompute active_defrag_cpu_percent. */ size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index ce74b7c618..78a68a682d 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -172,10 +172,12 @@ run_solo {defrag} { # make sure the defragger did enough work to keep the fragmentation low during loading. # we cannot check that it went all the way down, since we don't wait for full defrag cycle to complete. assert {$frag < 1.4} - # since the AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands), - # it'll still not block the loading for long periods of time. + # The AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands). + # Even so, defrag can get starved for periods exceeding 100ms. Using 200ms for test stability, and + # a 75% CPU requirement (as set above), we should allow up to 600ms latency + # (as total time = 200 non duty + 600 duty = 800ms, and 75% of 800ms is 600ms). 
if {!$::no_latency} { - assert {$max_latency <= 40} + assert {$max_latency <= 600} } } } ;# Active defrag - AOF loading From c962a3e1496f1ca67ef361e9d36e7c12e37b9659 Mon Sep 17 00:00:00 2001 From: Thalia Archibald Date: Fri, 13 Dec 2024 02:05:19 -0800 Subject: [PATCH 009/101] Check length before reading in `stringmatchlen` (#1431) Fixes four cases where `stringmatchlen` could overrun the pattern if it is not terminated with NUL. These commits are cherry-picked from my [fork](https://github.com/thaliaarchi/antirez-stringmatch) which extracts `stringmatch` as a library and compares it to other projects by antirez which use the same matcher. Signed-off-by: Thalia Archibald --- src/util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/util.c b/src/util.c index 0b7af2d3fa..6d99d47e5a 100644 --- a/src/util.c +++ b/src/util.c @@ -104,23 +104,23 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; - not_op = pattern[0] == '^'; + not_op = patternLen && pattern[0] == '^'; if (not_op) { pattern++; patternLen--; } match = 0; while (1) { - if (pattern[0] == '\\' && patternLen >= 2) { + if (patternLen >= 2 && pattern[0] == '\\') { pattern++; patternLen--; if (pattern[0] == string[0]) match = 1; - } else if (pattern[0] == ']') { - break; } else if (patternLen == 0) { pattern--; patternLen++; break; + } else if (pattern[0] == ']') { + break; } else if (patternLen >= 3 && pattern[1] == '-') { int start = pattern[0]; int end = pattern[2]; @@ -173,7 +173,7 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; if (stringLen == 0) { - while (*pattern == '*') { + while (patternLen && *pattern == '*') { pattern++; patternLen--; } From 2b733719b1e09c979a9157c8e92547d093178ae7 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 14 Dec 2024 05:32:54 +0800 Subject: [PATCH 010/101] Skip build-release-packages CI job in forks (#1438) The CI job was introduced in #1363, we should skip it in forks. Signed-off-by: Binbin --- .github/workflows/build-release-packages.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 44e012d658..c7d5c8fe54 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -23,6 +23,7 @@ jobs: # This job provides the version metadata from the tag for the other jobs to use. release-build-get-meta: name: Get metadata to build + if: github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} @@ -66,6 +67,7 @@ jobs: generate-build-matrix: name: Generating build matrix + if: github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }} From 630327cbecc5ba28a0a82357a7ecc4e212825d6f Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 14 Dec 2024 06:26:20 +0800 Subject: [PATCH 011/101] Fix wrong file name in build-release-packages.yml (#1437) Introduced in #1363, the file name does not match. 
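For reference, the resulting trigger filter then names the reusable
workflow files that actually exist in the tree (an abridged sketch of
the YAML after this change; the diff below is the authoritative edit):

    paths:
      - '.github/workflows/build-release-packages.yml'
      - '.github/workflows/call-build-linux-arm-packages.yml'
      # was 'call-build-linux-x86_64-packages.yml', which does not exist
      - '.github/workflows/call-build-linux-x86-packages.yml'
      - 'utils/releasetools/build-config.json'

With the stale name in the filter, pushes touching the x86 packaging
workflow would never have re-triggered this workflow.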
Signed-off-by: Binbin --- .github/workflows/build-release-packages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index c7d5c8fe54..6c54971bcd 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -7,7 +7,7 @@ on: paths: - '.github/workflows/build-release-packages.yml' - '.github/workflows/call-build-linux-arm-packages.yml' - - '.github/workflows/call-build-linux-x86_64-packages.yml' + - '.github/workflows/call-build-linux-x86-packages.yml' - 'utils/releasetools/build-config.json' workflow_dispatch: inputs: From 5bf8a6b40a0af683cf6dbaa3b5d9911c5267bcad Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Sat, 14 Dec 2024 10:13:04 -0800 Subject: [PATCH 012/101] Avoid importing memory aligned malloc (#1442) We deprecate the usage of classic malloc and free, but under certain circumstances they might get imported from intrinsics. The original thought is we should just override malloc and free to use zmalloc and zfree, but I think we should continue to deprecate it to avoid accidental imports of allocations. Closes https://github.com/valkey-io/valkey/issues/1434. --------- Signed-off-by: Madelyn Olson --- src/hyperloglog.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index f0390b3e1e..6056bc0098 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -36,6 +36,9 @@ #include #ifdef HAVE_AVX2 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. */ +#define __MM_MALLOC_H #include #endif From 8faf3c3c17e6340e70659ad24157c48fc76ae449 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Sat, 14 Dec 2024 10:14:01 -0800 Subject: [PATCH 013/101] Synchronously delete data during defrag tests (#1443) The creation of fragmentation is delayed when we use lazy-free. You can induce some of the active-defrag tests to fail by artificially adding a delay in the lazyfree process, similar to the issues seen in #1433 and issues like https://github.com/valkey-io/valkey/actions/runs/12267010712/job/34226304803#step:7:6538. The solution is to always do sync free during tests. Might close https://github.com/valkey-io/valkey/issues/1433. Signed-off-by: Madelyn Olson --- tests/unit/memefficiency.tcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 78a68a682d..8f6e5e8dd3 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -47,6 +47,8 @@ run_solo {defrag} { r config set active-defrag-ignore-bytes 2mb r config set maxmemory 100mb r config set maxmemory-policy allkeys-lru + r config set lazyfree-lazy-user-del no + r config set lazyfree-lazy-user-flush no populate 700000 asdf1 150 populate 100 asdf1 150 0 false 1000 From 3931ee48c392fae4eb23184a10afb5a111cf69d5 Mon Sep 17 00:00:00 2001 From: Rain Valentine Date: Sat, 14 Dec 2024 11:53:48 -0800 Subject: [PATCH 014/101] Replace dict with new hashtable for sets datatype (#1176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new `hashtable` provides faster lookups and uses less memory than `dict`. A TCL test case "SRANDMEMBER with a dict containing long chain" is deleted because it's covered by a hashtable unit test "test_random_entry_with_long_chain", which is already present. 
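For reviewers, a rough sketch of the set fast path in terms of the new
API, using only calls that appear in this diff (server context and error
handling omitted; this is an illustration, not the exact t_set.c code):

    hashtable *ht = hashtableCreate(&setHashtableType);

    /* Insert: the table takes ownership of the sds on success; on a
     * duplicate, hashtableAdd() returns 0 and we free our own copy. */
    sds ele = sdsnewlen("member", 6);
    if (!hashtableAdd(ht, ele)) sdsfree(ele);

    /* Membership test: build a temporary sds key, then free it. */
    sds probe = sdsnewlen("member", 6);
    int is_member = hashtableFind(ht, probe, NULL);
    sdsfree(probe);
    (void)is_member;

    /* Iteration: entries are bare sds strings, with no dictEntry
     * wrapper around them. */
    hashtableIterator iter;
    hashtableInitIterator(&iter, ht);
    void *entry;
    while (hashtableNext(&iter, &entry)) {
        sds member = entry;
        (void)member;
    }
    hashtableResetIterator(&iter);

    hashtableRelease(ht); /* frees entries via the type's destructor */

Storing elements as bare sds pointers instead of dictEntry-wrapped keys
is where much of the memory saving comes from.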
This change also moves some logic from dismissMemory (object.c) to zmadvise_dontneed (zmalloc.c), so the hashtable implementation which needs the dismiss functionality doesn't need to depend on object.c and server.h. This PR follows #1186. --------- Signed-off-by: Rain Valentine Signed-off-by: Viktor Söderqvist Co-authored-by: Viktor Söderqvist --- src/db.c | 72 +++++++++++------ src/debug.c | 29 ++++--- src/defrag.c | 42 ++++++---- src/hashtable.c | 10 ++- src/hashtable.h | 3 +- src/lazyfree.c | 6 +- src/module.c | 39 ++++++--- src/object.c | 56 +++++++------ src/rdb.c | 42 +++++----- src/server.c | 57 +++++-------- src/server.h | 9 ++- src/t_set.c | 172 ++++++++++++++++++++-------------------- src/t_zset.c | 24 +++--- src/zmalloc.c | 19 ++++- src/zmalloc.h | 2 +- tests/unit/info.tcl | 8 +- tests/unit/type/set.tcl | 107 +------------------------ 17 files changed, 326 insertions(+), 371 deletions(-) diff --git a/src/db.c b/src/db.c index 2bd40ba74b..1223d00c8d 100644 --- a/src/db.c +++ b/src/db.c @@ -978,7 +978,7 @@ void keysScanCallback(void *privdata, void *entry) { /* This callback is used by scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. */ -void scanCallback(void *privdata, const dictEntry *de) { +void dictScanCallback(void *privdata, const dictEntry *de) { scanData *data = (scanData *)privdata; list *keys = data->keys; robj *o = data->o; @@ -998,9 +998,7 @@ void scanCallback(void *privdata, const dictEntry *de) { } } - if (o->type == OBJ_SET) { - key = keysds; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { key = keysds; if (!data->only_keys) { val = dictGetVal(de); @@ -1013,13 +1011,33 @@ void scanCallback(void *privdata, const dictEntry *de) { val = sdsnewlen(buf, len); } } else { - serverPanic("Type not handled in SCAN callback."); + serverPanic("Type not handled in dict SCAN callback."); } listAddNodeTail(keys, key); if (val) listAddNodeTail(keys, val); } +void hashtableScanCallback(void *privdata, void *entry) { + scanData *data = (scanData *)privdata; + robj *o = data->o; + list *keys = data->keys; + data->sampled++; + + /* currently only implemented for SET scan */ + serverAssert(o && o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE); + sds key = (sds)entry; /* Specific for OBJ_SET */ + + /* Filter element if it does not match the pattern. */ + if (data->pattern) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { + return; + } + } + + listAddNodeTail(keys, key); +} + /* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns C_OK. Otherwise return C_ERR and send an error to the @@ -1083,7 +1101,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { sds typename = NULL; long long type = LLONG_MAX; int patlen = 0, use_pattern = 0, only_keys = 0; - dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object * must be Set, Sorted Set, or Hash. */ @@ -1152,34 +1169,35 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * just return everything inside the object in a single call, setting the * cursor to zero to signal the end of the iteration. */ - /* Handle the case of a hash table. */ - ht = NULL; + /* Handle the case of kvstore, dict or hashtable. 
*/ + dict *dict_table = NULL; + hashtable *hashtable_table = NULL; + int shallow_copied_list_items = 0; if (o == NULL) { - ht = NULL; - } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + shallow_copied_list_items = 1; + } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + dict_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - ht = zs->dict; + dict_table = zs->dict; + /* scanning ZSET allocates temporary strings even though it's a dict */ + shallow_copied_list_items = 0; } list *keys = listCreate(); - /* Set a free callback for the contents of the collected keys list. - * For the main keyspace dict, and when we scan a key that's dict encoded - * (we have 'ht'), we don't need to define free method because the strings - * in the list are just a shallow copy from the pointer in the dictEntry. - * When scanning a key with other encodings (e.g. listpack), we need to - * free the temporary strings we add to that list. - * The exception to the above is ZSET, where we do allocate temporary - * strings even when scanning a dict. */ - if (o && (!ht || o->type == OBJ_ZSET)) { + /* Set a free callback for the contents of the collected keys list if they + * are deep copied temporary strings. We must not free them if they are just + * a shallow copy - a pointer to the actual data in the data structure */ + if (!shallow_copied_list_items) { listSetFreeMethod(keys, (void (*)(void *))sdsfree); } - /* For main dictionary scan or data structure using hashtable. */ - if (!o || ht) { + /* For main hash table scan or scannable data structure. */ + if (!o || dict_table || hashtable_table) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very * sparsely populated) we avoid to block too much time at the cost @@ -1188,7 +1206,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* We pass scanData which have three pointers to the callback: * 1. data.keys: the list to which it will add new elements; - * 2. data.o: the object containing the dictionary so that + * 2. data.o: the object containing the hash table so that * it is possible to fetch more data in a type-dependent way; * 3. data.type: the specified type scan in the db, LLONG_MAX means * type matching is no needed; @@ -1219,8 +1237,10 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * If cursor is empty, we should try exploring next non-empty slot. 
*/ if (o == NULL) { cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data); + } else if (dict_table) { + cursor = dictScan(dict_table, cursor, dictScanCallback, &data); } else { - cursor = dictScan(ht, cursor, scanCallback, &data); + cursor = hashtableScan(hashtable_table, cursor, hashtableScanCallback, &data); } } while (cursor && maxiterations-- && data.sampled < count); } else if (o->type == OBJ_SET) { diff --git a/src/debug.c b/src/debug.c index d63d12f762..4efe12e237 100644 --- a/src/debug.c +++ b/src/debug.c @@ -916,30 +916,35 @@ void debugCommand(client *c) { addReplyVerbatim(c, stats, sdslen(stats), "txt"); sdsfree(stats); } else if (!strcasecmp(c->argv[1]->ptr, "htstats-key") && c->argc >= 3) { - robj *o; - dict *ht = NULL; int full = 0; - if (c->argc >= 4 && !strcasecmp(c->argv[3]->ptr, "full")) full = 1; - if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr)) == NULL) return; + robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr); + if (o == NULL) return; - /* Get the hash table reference from the object, if possible. */ + /* Get the dict reference from the object, if possible. */ + dict *d = NULL; + hashtable *ht = NULL; switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: { zset *zs = o->ptr; - ht = zs->dict; + d = zs->dict; } break; - case OBJ_ENCODING_HT: ht = o->ptr; break; + case OBJ_ENCODING_HT: d = o->ptr; break; + case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break; } - if (ht == NULL) { - addReplyError(c, "The value stored at the specified key is not " - "represented using an hash table"); - } else { + if (d != NULL) { char buf[4096]; - dictGetStats(buf, sizeof(buf), ht, full); + dictGetStats(buf, sizeof(buf), d, full); addReplyVerbatim(c, buf, strlen(buf), "txt"); + } else if (ht != NULL) { + char buf[4096]; + hashtableGetStats(buf, sizeof(buf), ht, full); + addReplyVerbatim(c, buf, strlen(buf), "txt"); + } else { + addReplyError(c, "The value stored at the specified key is not " + "represented using an hash table"); } } else if (!strcasecmp(c->argv[1]->ptr, "change-repl-id") && c->argc == 2) { serverLog(LL_NOTICE, "Changing replication IDs after receiving DEBUG change-repl-id"); diff --git a/src/defrag.c b/src/defrag.c index 8c1ad29de2..8e7fc8449e 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -34,6 +34,7 @@ */ #include "server.h" +#include "hashtable.h" #include "script.h" #include @@ -379,6 +380,20 @@ static void activeDefragSdsDict(dict *d, int val_type) { } while (cursor != 0); } +void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) { + UNUSED(privdata); + sds *sds_ref = (sds *)entry_ref; + sds new_sds = activeDefragSds(*sds_ref); + if (new_sds != NULL) *sds_ref = new_sds; +} + +void activeDefragSdsHashtable(hashtable *ht) { + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); +} + /* Defrag a list of ptr, sds or robj string values */ static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; @@ -497,11 +512,9 @@ static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { } static void scanLaterSet(robj *ob, unsigned long *cursor) { - if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; - dict *d = ob->ptr; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds}; - *cursor = dictScanDefrag(d, 
*cursor, scanCallbackCountScanned, &defragfns, NULL); + if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } static void scanLaterHash(robj *ob, unsigned long *cursor) { @@ -560,15 +573,16 @@ static void defragHash(robj *ob) { } static void defragSet(robj *ob) { - dict *d, *newd; - serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) + serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); - else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; + } else { + activeDefragSdsHashtable(ht); + } + /* defrag the hashtable struct and tables */ + hashtable *newHashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (newHashtable) ob->ptr = newHashtable; } /* Defrag callback for radix tree iterator, called for each node, @@ -766,7 +780,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { serverPanic("Unknown list encoding"); } } else if (ob->type == OBJ_SET) { - if (ob->encoding == OBJ_ENCODING_HT) { + if (ob->encoding == OBJ_ENCODING_HASHTABLE) { defragSet(ob); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; diff --git a/src/hashtable.c b/src/hashtable.c index 9d963b9ddc..11ba360800 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -1023,7 +1023,7 @@ void *hashtableMetadata(hashtable *ht) { } /* Returns the number of entries stored. */ -size_t hashtableSize(hashtable *ht) { +size_t hashtableSize(const hashtable *ht) { return ht->used[0] + ht->used[1]; } @@ -1180,6 +1180,14 @@ hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) { return ht1; } +/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've + * forked and memory won't be used again. See zmadvise_dontneed() */ +void dismissHashtable(hashtable *ht) { + for (int i = 0; i < 2; i++) { + zmadvise_dontneed(ht->tables[i], numBuckets(ht->bucket_exp[i]) * sizeof(bucket *)); + } +} + /* Returns 1 if an entry was found matching the key. Also points *found to it, * if found is provided. Returns 0 if no matching entry was found. 
*/ int hashtableFind(hashtable *ht, const void *key, void **found) { diff --git a/src/hashtable.h b/src/hashtable.h index 242531df8f..4291cf5a5d 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -108,7 +108,7 @@ void hashtableRelease(hashtable *ht); void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)); hashtableType *hashtableGetType(hashtable *ht); void *hashtableMetadata(hashtable *ht); -size_t hashtableSize(hashtable *ht); +size_t hashtableSize(const hashtable *ht); size_t hashtableBuckets(hashtable *ht); size_t hashtableChainedBuckets(hashtable *ht, int table); size_t hashtableMemUsage(hashtable *ht); @@ -123,6 +123,7 @@ int hashtableTryExpand(hashtable *ht, size_t size); int hashtableExpandIfNeeded(hashtable *ht); int hashtableShrinkIfNeeded(hashtable *ht); hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)); +void dismissHashtable(hashtable *ht); /* Entries */ int hashtableFind(hashtable *ht, const void *key, void **found); diff --git a/src/lazyfree.c b/src/lazyfree.c index 14a4454d7a..4b4c7f06ad 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -116,9 +116,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { if (obj->type == OBJ_LIST && obj->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = obj->ptr; return ql->len; - } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = obj->ptr; return zs->zsl->length; diff --git a/src/module.c b/src/module.c index 9bcf68646e..36283e2c73 100644 --- a/src/module.c +++ b/src/module.c @@ -11017,20 +11017,20 @@ typedef struct { ValkeyModuleScanKeyCB fn; } ScanKeyCBData; -static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { +static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { ScanKeyCBData *data = privdata; sds key = dictGetKey(de); robj *o = data->key->value; robj *field = createStringObject(key, sdslen(key)); robj *value = NULL; - if (o->type == OBJ_SET) { - value = NULL; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { sds val = dictGetVal(de); value = createStringObject(val, sdslen(val)); } else if (o->type == OBJ_ZSET) { double *val = (double *)dictGetVal(de); value = createStringObjectFromLongDouble(*val, 0); + } else { + serverPanic("unexpected object type"); } data->fn(data->key, field, value, data->user_data); @@ -11038,6 +11038,17 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { if (value) decrRefCount(value); } +static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { + ScanKeyCBData *data = privdata; + robj *o = data->key->value; + serverAssert(o->type == OBJ_SET); + sds key = entry; + robj *field = createStringObject(key, sdslen(key)); + + data->fn(data->key, field, NULL, data->user_data); + decrRefCount(field); +} + /* Scan api that allows a module to scan the elements in a hash, set or sorted set key * * Callback for scan implementation. 
@@ -11091,14 +11102,15 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul errno = EINVAL; return 0; } - dict *ht = NULL; + dict *d = NULL; + hashtable *ht = NULL; robj *o = key->value; if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_HASH) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HT) d = o->ptr; } else if (o->type == OBJ_ZSET) { - if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->dict; + if (o->encoding == OBJ_ENCODING_SKIPLIST) d = ((zset *)o->ptr)->dict; } else { errno = EINVAL; return 0; @@ -11108,9 +11120,16 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul return 0; } int ret = 1; - if (ht) { + if (d) { + ScanKeyCBData data = {key, privdata, fn}; + cursor->cursor = dictScan(d, cursor->cursor, moduleScanKeyDictCallback, &data); + if (cursor->cursor == 0) { + cursor->done = 1; + ret = 0; + } + } else if (ht) { ScanKeyCBData data = {key, privdata, fn}; - cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data); + cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data); if (cursor->cursor == 0) { cursor->done = 1; ret = 0; diff --git a/src/object.c b/src/object.c index ac1c26adf9..15363f31b8 100644 --- a/src/object.c +++ b/src/object.c @@ -429,9 +429,9 @@ robj *createListListpackObject(void) { } robj *createSetObject(void) { - dict *d = dictCreate(&setDictType); - robj *o = createObject(OBJ_SET, d); - o->encoding = OBJ_ENCODING_HT; + hashtable *ht = hashtableCreate(&setHashtableType); + robj *o = createObject(OBJ_SET, ht); + o->encoding = OBJ_ENCODING_HASHTABLE; return o; } @@ -506,7 +506,7 @@ void freeListObject(robj *o) { void freeSetObject(robj *o) { switch (o->encoding) { - case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; + case OBJ_ENCODING_HASHTABLE: hashtableRelease((hashtable *)o->ptr); break; case OBJ_ENCODING_INTSET: case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break; default: serverPanic("Unknown set encoding type"); @@ -622,23 +622,23 @@ void dismissListObject(robj *o, size_t size_hint) { /* See dismissObject() */ void dismissSetObject(robj *o, size_t size_hint) { - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - serverAssert(dictSize(set) != 0); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + serverAssert(hashtableSize(ht) != 0); /* We iterate all nodes only when average member size is bigger than a * page size, and there's a high chance we'll actually dismiss something. */ - if (size_hint / dictSize(set) >= server.page_size) { - dictEntry *de; - dictIterator *di = dictGetIterator(set); - while ((de = dictNext(di)) != NULL) { - dismissSds(dictGetKey(de)); + if (size_hint / hashtableSize(ht) >= server.page_size) { + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + sds item = next; + dismissSds(item); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } - /* Dismiss hash table memory. 
*/ - dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0]) * sizeof(dictEntry *)); - dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1]) * sizeof(dictEntry *)); + dismissHashtable(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { dismissMemory(o->ptr, intsetBlobLen((intset *)o->ptr)); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -728,7 +728,7 @@ void dismissStreamObject(robj *o, size_t size_hint) { * modifies any keys due to write traffic, it'll cause CoW which consume * physical memory. In the child process, after serializing the key and value, * the data is definitely not accessed again, so to avoid unnecessary CoW, we - * try to release their memory back to OS. see dismissMemory(). + * try to release their memory back to OS. see zmadvise_dontneed(). * * Because of the cost of iterating all node/field/member/entry of complex data * types, we iterate and dismiss them only when approximate average we estimate @@ -1109,6 +1109,7 @@ char *strEncoding(int encoding) { case OBJ_ENCODING_RAW: return "raw"; case OBJ_ENCODING_INT: return "int"; case OBJ_ENCODING_HT: return "hashtable"; + case OBJ_ENCODING_HASHTABLE: return "hashtable"; case OBJ_ENCODING_QUICKLIST: return "quicklist"; case OBJ_ENCODING_LISTPACK: return "listpack"; case OBJ_ENCODING_INTSET: return "intset"; @@ -1160,17 +1161,20 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { serverPanic("Unknown list encoding"); } } else if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) { - d = o->ptr; - di = dictGetIterator(d); - asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); - while ((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - elesize += dictEntryMemUsage(de) + sdsAllocSize(ele); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + asize = sizeof(*o) + hashtableMemUsage(ht); + + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next) && samples < sample_size) { + sds element = next; + elesize += sdsAllocSize(element); samples++; } - dictReleaseIterator(di); - if (samples) asize += (double)elesize / samples * dictSize(d); + hashtableResetIterator(&iter); + if (samples) asize += (double)elesize / samples * hashtableSize(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { diff --git a/src/rdb.c b/src/rdb.c index 6e990736bc..5fb77a2897 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -692,7 +692,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case OBJ_SET: if (o->encoding == OBJ_ENCODING_INTSET) return rdbSaveType(rdb, RDB_TYPE_SET_INTSET); - else if (o->encoding == OBJ_ENCODING_HT) + else if (o->encoding == OBJ_ENCODING_HASHTABLE) return rdbSaveType(rdb, RDB_TYPE_SET); else if (o->encoding == OBJ_ENCODING_LISTPACK) return rdbSaveType(rdb, RDB_TYPE_SET_LISTPACK); @@ -876,26 +876,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { } } else if (o->type == OBJ_SET) { /* Save a set value */ - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *set = o->ptr; - if ((n = rdbSaveLen(rdb, dictSize(set))) == -1) { - dictReleaseIterator(di); + if ((n = rdbSaveLen(rdb, hashtableSize(set))) == -1) { return -1; } nwritten += n; - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); + 
hashtableIterator iterator; + hashtableInitIterator(&iterator, set); + void *next; + while (hashtableNext(&iterator, &next)) { + sds ele = next; if ((n = rdbSaveRawString(rdb, (unsigned char *)ele, sdslen(ele))) == -1) { - dictReleaseIterator(di); + hashtableResetIterator(&iterator); return -1; } nwritten += n; } - dictReleaseIterator(di); + hashtableResetIterator(&iterator); } else if (o->encoding == OBJ_ENCODING_INTSET) { size_t l = intsetBlobLen((intset *)o->ptr); @@ -1909,8 +1909,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + if (!hashtableTryExpand(o->ptr, len)) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; } @@ -1949,8 +1949,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { * of many small ones. It's OK since lpSafeToAdd doesn't * care about individual elements, only the total size. */ setTypeConvert(o, OBJ_ENCODING_LISTPACK); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1970,8 +1970,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { return NULL; } o->ptr = lpAppend(o->ptr, (unsigned char *)sdsele, elelen); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1980,8 +1980,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* This will also be called when the set was just converted * to a regular hash table encoded set. 
*/ - if (o->encoding == OBJ_ENCODING_HT) { - if (dictAdd((dict *)o->ptr, sdsele, NULL) != DICT_OK) { + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + if (!hashtableAdd((hashtable *)o->ptr, sdsele)) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); sdsfree(sdsele); @@ -2356,7 +2356,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; - if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_SET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; @@ -2376,7 +2376,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(o); goto emptykey; } - if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_ZSET_ZIPLIST: { unsigned char *lp = lpNew(encoded_len); diff --git a/src/server.c b/src/server.c index e495730fe2..da06884eb1 100644 --- a/src/server.c +++ b/src/server.c @@ -372,6 +372,7 @@ void dictDictDestructor(void *val) { dictRelease((dict *)val); } +/* Returns 1 when keys match */ int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; l1 = sdslen((sds)key1); @@ -380,6 +381,12 @@ int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +/* Returns 0 when keys match */ +int hashtableSdsKeyCompare(const void *key1, const void *key2) { + const sds sds1 = (const sds)key1, sds2 = (const sds)key2; + return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); +} + size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint8_t *key_offset) { return sdscopytobuffer(buf, buf_len, (sds)key, key_offset); } @@ -542,17 +549,11 @@ dictType objectKeyHeapPointerValueDictType = { NULL /* allow to expand */ }; -/* Set dictionary type. Keys are SDS strings, values are not used. */ -dictType setDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL, /* allow to expand */ - .no_value = 1, /* no values in this dict */ - .keys_are_odd = 1 /* an SDS string is always an odd pointer */ -}; +/* Set hashtable type. 
Items are SDS strings */ +hashtableType setHashtableType = { + .hashFunction = dictSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = dictSdsDestructor}; /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ dictType zsetDictType = { @@ -572,11 +573,6 @@ const void *hashtableObjectGetKey(const void *entry) { return objectGetKey(entry); } -int hashtableSdsKeyCompare(const void *key1, const void *key2) { - const sds sds1 = (const sds)key1, sds2 = (const sds)key2; - return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); -} - int hashtableObjKeyCompare(const void *key1, const void *key2) { const robj *o1 = key1, *o2 = key2; return hashtableSdsKeyCompare(o1->ptr, o2->ptr); @@ -645,6 +641,11 @@ dictType sdsReplyDictType = { NULL /* allow to expand */ }; +/* Hashtable type without destructor */ +hashtableType sdsReplyHashtableType = { + .hashFunction = dictSdsCaseHash, + .keyCompare = hashtableSdsKeyCompare}; + /* Keylist hash table type has unencoded Objects as keys and * lists as values. It's used for blocking operations (BLPOP) and to * map swapped keys to a list of clients waiting for this keys to be loaded. */ @@ -6528,27 +6529,7 @@ void sendChildInfo(childInfoType info_type, size_t keys, char *pname) { sendChildInfoGeneric(info_type, keys, -1, pname); } -/* Try to release pages back to the OS directly (bypassing the allocator), - * in an effort to decrease CoW during fork. For small allocations, we can't - * release any full page, so in an effort to avoid getting the size of the - * allocation from the allocator (malloc_size) when we already know it's small, - * we check the size_hint. If the size is not already known, passing a size_hint - * of 0 will lead the checking the real size of the allocation. - * Also please note that the size may be not accurate, so in order to make this - * solution effective, the judgement for releasing memory pages should not be - * too strict. */ -void dismissMemory(void *ptr, size_t size_hint) { - if (ptr == NULL) return; - - /* madvise(MADV_DONTNEED) can not release pages if the size of memory - * is too small, we try to release only for the memory which the size - * is more than half of page size. */ - if (size_hint && size_hint <= server.page_size / 2) return; - - zmadvise_dontneed(ptr); -} - -/* Dismiss big chunks of memory inside a client structure, see dismissMemory() */ +/* Dismiss big chunks of memory inside a client structure, see zmadvise_dontneed() */ void dismissClientMemory(client *c) { /* Dismiss client query buffer and static reply buffer. */ dismissMemory(c->buf, c->buf_usable_size); @@ -6579,7 +6560,7 @@ void dismissClientMemory(client *c) { /* In the child process, we don't need some buffers anymore, and these are * likely to change in the parent when there's heavy write traffic. * We dismiss them right away, to avoid CoW. - * see dismissMemory(). */ + * see zmadvise_dontneed(). */ void dismissMemoryInChild(void) { /* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. */ if (server.thp_enabled) return; diff --git a/src/server.h b/src/server.h index 88afb57c81..b07144de92 100644 --- a/src/server.h +++ b/src/server.h @@ -83,6 +83,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "connection.h" /* Connection abstraction */ #include "memory_prefetch.h" +#define dismissMemory zmadvise_dontneed + #define VALKEYMODULE_CORE 1 typedef struct serverObject robj; #include "valkeymodule.h" /* Modules API defines. 
*/ @@ -873,6 +875,7 @@ struct ValkeyModuleDigest { #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */ #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */ +#define OBJ_ENCODING_HASHTABLE 12 /* Encoded as a hashtable */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */ @@ -2635,7 +2638,7 @@ typedef struct { robj *subject; int encoding; int ii; /* intset iterator */ - dictIterator *di; + hashtableIterator *hashtable_iterator; unsigned char *lpi; /* listpack iterator */ } setTypeIterator; @@ -2666,7 +2669,7 @@ extern struct valkeyServer server; extern struct sharedObjectsStruct shared; extern dictType objectKeyPointerValueDictType; extern dictType objectKeyHeapPointerValueDictType; -extern dictType setDictType; +extern hashtableType setHashtableType; extern dictType BenchmarkDictType; extern dictType zsetDictType; extern hashtableType kvstoreKeysHashtableType; @@ -2681,6 +2684,7 @@ extern dictType objToDictDictType; extern hashtableType kvstoreChannelHashtableType; extern dictType modulesDictType; extern dictType sdsReplyDictType; +extern hashtableType sdsReplyHashtableType; extern dictType keylistDictType; extern dict *modules; @@ -3375,7 +3379,6 @@ void rejectCommandFormat(client *c, const char *fmt, ...); void *activeDefragAlloc(void *ptr); robj *activeDefragStringOb(robj *ob); void dismissSds(sds s); -void dismissMemory(void *ptr, size_t size_hint); void dismissMemoryInChild(void); #define RESTART_SERVER_NONE 0 diff --git a/src/t_set.c b/src/t_set.c index 997fa2f5c9..4279baf82f 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "hashtable.h" #include "intset.h" /* Compact integer set structure */ /*----------------------------------------------------------------------------- @@ -50,7 +51,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { /* We may oversize the set by using the hint if the hint is not accurate, * but we will assume this is acceptable to maximize performance. */ robj *o = createSetObject(); - dictExpand(o->ptr, size_hint); + hashtableExpand(o->ptr, size_hint); return o; } @@ -59,7 +60,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { void setTypeMaybeConvert(robj *set, size_t size_hint) { if ((set->encoding == OBJ_ENCODING_LISTPACK && size_hint > server.set_max_listpack_entries) || (set->encoding == OBJ_ENCODING_INTSET && size_hint > server.set_max_intset_entries)) { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, size_hint, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, size_hint, 1); } } @@ -74,7 +75,7 @@ static size_t intsetMaxEntries(void) { /* Converts intset to HT if it contains too many entries. */ static void maybeConvertIntset(robj *subject) { serverAssert(subject->encoding == OBJ_ENCODING_INTSET); - if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HT); + if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HASHTABLE); } /* When you know all set elements are integers, call this to convert the set to @@ -91,7 +92,7 @@ static void maybeConvertToIntset(robj *set) { while (setTypeNext(si, &str, &len, &llval) != -1) { if (str) { /* If the element is returned as a string, we may be able to convert - * it to integer. This happens for OBJ_ENCODING_HT. */ + * it to integer. This happens for OBJ_ENCODING_HASHTABLE. 
*/ serverAssert(string2ll(str, len, (long long *)&llval)); } uint8_t success = 0; @@ -134,20 +135,21 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd } serverAssert(str); - if (set->encoding == OBJ_ENCODING_HT) { + if (set->encoding == OBJ_ENCODING_HASHTABLE) { /* Avoid duping the string if it is an sds string. */ sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len); - dict *ht = set->ptr; - void *position = dictFindPositionForInsert(ht, sdsval, NULL); - if (position) { + hashtable *ht = set->ptr; + hashtablePosition position; + if (hashtableFindPositionForInsert(ht, sdsval, &position, NULL)) { /* Key doesn't already exist in the set. Add it but dup the key. */ if (sdsval == str) sdsval = sdsdup(sdsval); - dictInsertAtPosition(ht, sdsval, position); + hashtableInsertAtPosition(ht, sdsval, &position); + return 1; } else if (sdsval != str) { /* String is already a member. Free our temporary sds copy. */ sdsfree(sdsval); + return 0; } - return (position != NULL); } else if (set->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *lp = set->ptr; unsigned char *p = lpFirst(lp); @@ -166,8 +168,8 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd set->ptr = lp; } else { /* Size limit is reached. Convert to hashtable and add. */ - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, lpLength(lp) + 1, 1); - serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, lpLength(lp) + 1, 1); + serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len))); } return 1; } @@ -204,10 +206,10 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd set->ptr = lp; return 1; } else { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, intsetLen(set->ptr) + 1, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, intsetLen(set->ptr) + 1, 1); /* The set *was* an intset and this value is not integer - * encodable, so dictAdd should always work. */ - serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK); + * encodable, so hashtableAdd should always work. */ + serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len))); return 1; } } @@ -242,9 +244,9 @@ int setTypeRemoveAux(robj *setobj, char *str, size_t len, int64_t llval, int str str_is_sds = 0; } - if (setobj->encoding == OBJ_ENCODING_HT) { + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = str_is_sds ? 
(sds)str : sdsnewlen(str, len); - int deleted = (dictDelete(setobj->ptr, sdsval) == DICT_OK); + int deleted = hashtableDelete(setobj->ptr, sdsval); if (sdsval != str) sdsfree(sdsval); /* free temp copy */ return deleted; } else if (setobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -298,11 +300,11 @@ int setTypeIsMemberAux(robj *set, char *str, size_t len, int64_t llval, int str_ } else if (set->encoding == OBJ_ENCODING_INTSET) { long long llval; return string2ll(str, len, &llval) && intsetFind(set->ptr, llval); - } else if (set->encoding == OBJ_ENCODING_HT && str_is_sds) { - return dictFind(set->ptr, (sds)str) != NULL; - } else if (set->encoding == OBJ_ENCODING_HT) { + } else if (set->encoding == OBJ_ENCODING_HASHTABLE && str_is_sds) { + return hashtableFind(set->ptr, (sds)str, NULL); + } else if (set->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = sdsnewlen(str, len); - int result = dictFind(set->ptr, sdsval) != NULL; + int result = hashtableFind(set->ptr, sdsval, NULL); sdsfree(sdsval); return result; } else { @@ -314,8 +316,8 @@ setTypeIterator *setTypeInitIterator(robj *subject) { setTypeIterator *si = zmalloc(sizeof(setTypeIterator)); si->subject = subject; si->encoding = subject->encoding; - if (si->encoding == OBJ_ENCODING_HT) { - si->di = dictGetIterator(subject->ptr); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + si->hashtable_iterator = hashtableCreateIterator(subject->ptr); } else if (si->encoding == OBJ_ENCODING_INTSET) { si->ii = 0; } else if (si->encoding == OBJ_ENCODING_LISTPACK) { @@ -327,7 +329,7 @@ setTypeIterator *setTypeInitIterator(robj *subject) { } void setTypeReleaseIterator(setTypeIterator *si) { - if (si->encoding == OBJ_ENCODING_HT) dictReleaseIterator(si->di); + if (si->encoding == OBJ_ENCODING_HASHTABLE) hashtableReleaseIterator(si->hashtable_iterator); zfree(si); } @@ -340,7 +342,7 @@ void setTypeReleaseIterator(setTypeIterator *si) { * (str and len) or (llele) depending on whether the value is stored as a string * or as an integer internally. * - * If OBJ_ENCODING_HT is returned, then str points to an sds string and can be + * If OBJ_ENCODING_HASHTABLE is returned, then str points to an sds string and can be * used as such. If OBJ_ENCODING_INTSET, then llele is populated and str is * pointed to NULL. If OBJ_ENCODING_LISTPACK is returned, the value can be * either a string or an integer. If *str is not NULL, then str and len are @@ -353,10 +355,10 @@ void setTypeReleaseIterator(setTypeIterator *si) { * * When there are no more elements -1 is returned. */ int setTypeNext(setTypeIterator *si, char **str, size_t *len, int64_t *llele) { - if (si->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictNext(si->di); - if (de == NULL) return -1; - *str = dictGetKey(de); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(si->hashtable_iterator, &next)) return -1; + *str = next; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (si->encoding == OBJ_ENCODING_INTSET) { @@ -406,15 +408,16 @@ sds setTypeNextObject(setTypeIterator *si) { * object. The return value of the function is the object->encoding * field of the object and can be used by the caller to check if the * int64_t pointer or the str and len pointers were populated, as for - * setTypeNext. If OBJ_ENCODING_HT is returned, str is pointed to a + * setTypeNext. If OBJ_ENCODING_HASHTABLE is returned, str is pointed to a * string which is actually an sds string and it can be used as such. 
* * Note that both the str, len and llele pointers should be passed and cannot * be NULL. If str is set to NULL, the value is an integer stored in llele. */ int setTypeRandomElement(robj *setobj, char **str, size_t *len, int64_t *llele) { - if (setobj->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictGetFairRandomKey(setobj->ptr); - *str = dictGetKey(de); + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { + void *entry = NULL; + hashtableFairRandomEntry(setobj->ptr, &entry); + *str = entry; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (setobj->encoding == OBJ_ENCODING_INTSET) { @@ -457,14 +460,14 @@ robj *setTypePopRandom(robj *set) { obj = createStringObject(str, len); else obj = createStringObjectFromLongLong(llele); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } return obj; } unsigned long setTypeSize(const robj *subject) { - if (subject->encoding == OBJ_ENCODING_HT) { - return dictSize((const dict *)subject->ptr); + if (subject->encoding == OBJ_ENCODING_HASHTABLE) { + return hashtableSize((const hashtable *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_INTSET) { return intsetLen((const intset *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_LISTPACK) { @@ -474,7 +477,7 @@ unsigned long setTypeSize(const robj *subject) { } } -/* Convert the set to specified encoding. The resulting dict (when converting +/* Convert the set to specified encoding. The resulting hashtable (when converting * to a hash table) is presized to hold the number of elements in the original * set. */ void setTypeConvert(robj *setobj, int enc) { @@ -489,28 +492,28 @@ int setTypeConvertAndExpand(robj *setobj, int enc, unsigned long cap, int panic) setTypeIterator *si; serverAssertWithInfo(NULL, setobj, setobj->type == OBJ_SET && setobj->encoding != enc); - if (enc == OBJ_ENCODING_HT) { - dict *d = dictCreate(&setDictType); + if (enc == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = hashtableCreate(&setHashtableType); sds element; - /* Presize the dict to avoid rehashing */ + /* Presize the hashtable to avoid rehashing */ if (panic) { - dictExpand(d, cap); - } else if (dictTryExpand(d, cap) != DICT_OK) { - dictRelease(d); + hashtableExpand(ht, cap); + } else if (!hashtableTryExpand(ht, cap)) { + hashtableRelease(ht); return C_ERR; } /* To add the elements we extract integers and create Objects */ si = setTypeInitIterator(setobj); while ((element = setTypeNextObject(si)) != NULL) { - serverAssert(dictAdd(d, element, NULL) == DICT_OK); + serverAssert(hashtableAdd(ht, element)); } setTypeReleaseIterator(si); freeSetObject(setobj); /* frees the internals but not setobj itself */ - setobj->encoding = OBJ_ENCODING_HT; - setobj->ptr = d; + setobj->encoding = OBJ_ENCODING_HASHTABLE; + setobj->ptr = ht; } else if (enc == OBJ_ENCODING_LISTPACK) { /* Preallocate the minimum two bytes per element (enc/value + backlen) */ size_t estcap = cap * 2; @@ -568,10 +571,10 @@ robj *setTypeDup(robj *o) { memcpy(new_lp, lp, sz); set = createObject(OBJ_SET, new_lp); set->encoding = OBJ_ENCODING_LISTPACK; - } else if (o->encoding == OBJ_ENCODING_HT) { + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { set = createSetObject(); - dict *d = o->ptr; - dictExpand(set->ptr, dictSize(d)); + hashtable *ht = o->ptr; + hashtableExpand(set->ptr, hashtableSize(ht)); si = setTypeInitIterator(o); char *str; size_t len; @@ -891,8 +894,8 @@ void spopWithCountCommand(client *c) { if 
(!newset) { newset = str ? createSetListpackObject() : createIntsetObject(); } - setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HT); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } } @@ -1001,8 +1004,6 @@ void srandmemberWithCountCommand(client *c) { size_t len; int64_t llele; - dict *d; - if (getRangeLongFromObjectOrReply(c, c->argv[2], -LONG_MAX, LONG_MAX, &l, NULL) != C_OK) return; if (l >= 0) { count = (unsigned long)l; @@ -1111,8 +1112,8 @@ void srandmemberWithCountCommand(client *c) { return; } - /* For CASE 3 and CASE 4 we need an auxiliary dictionary. */ - d = dictCreate(&sdsReplyDictType); + /* For CASE 3 and CASE 4 we need an auxiliary hashtable. */ + hashtable *ht = hashtableCreate(&sdsReplyHashtableType); /* CASE 3: * The number of elements inside the set is not greater than @@ -1126,29 +1127,25 @@ void srandmemberWithCountCommand(client *c) { if (count * SRANDMEMBER_SUB_STRATEGY_MUL > size) { setTypeIterator *si; - /* Add all the elements into the temporary dictionary. */ + /* Add all the elements into the temporary hashtable. */ si = setTypeInitIterator(set); - dictExpand(d, size); + hashtableExpand(ht, size); while (setTypeNext(si, &str, &len, &llele) != -1) { - int retval = DICT_ERR; - if (str == NULL) { - retval = dictAdd(d, sdsfromlonglong(llele), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsfromlonglong(llele))); } else { - retval = dictAdd(d, sdsnewlen(str, len), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsnewlen(str, len))); } - serverAssert(retval == DICT_OK); } setTypeReleaseIterator(si); - serverAssert(dictSize(d) == size); + serverAssert(hashtableSize(ht) == size); /* Remove random elements to reach the right count. */ while (size > count) { - dictEntry *de; - de = dictGetFairRandomKey(d); - dictUnlink(d, dictGetKey(de)); - sdsfree(dictGetKey(de)); - dictFreeUnlinkedEntry(d, de); + void *element; + hashtableFairRandomEntry(ht, &element); + hashtableDelete(ht, element); + sdsfree((sds)element); size--; } } @@ -1161,7 +1158,7 @@ void srandmemberWithCountCommand(client *c) { unsigned long added = 0; sds sdsele; - dictExpand(d, count); + hashtableExpand(ht, count); while (added < count) { setTypeRandomElement(set, &str, &len, &llele); if (str == NULL) { @@ -1172,7 +1169,7 @@ void srandmemberWithCountCommand(client *c) { /* Try to add the object to the dictionary. If it already exists * free it, otherwise increment the number of objects we have * in the result dictionary. */ - if (dictAdd(d, sdsele, NULL) == DICT_OK) + if (hashtableAdd(ht, sdsele)) added++; else sdsfree(sdsele); @@ -1181,14 +1178,15 @@ void srandmemberWithCountCommand(client *c) { /* CASE 3 & 4: send the result to the user. 
*/ { - dictIterator *di; - dictEntry *de; + hashtableIterator iter; + hashtableInitIterator(&iter, ht); addReplyArrayLen(c, count); - di = dictGetIterator(d); - while ((de = dictNext(di)) != NULL) addReplyBulkSds(c, dictGetKey(de)); - dictReleaseIterator(di); - dictRelease(d); + serverAssert(count == hashtableSize(ht)); + void *element; + while (hashtableNext(&iter, &element)) addReplyBulkSds(c, (sds)element); + hashtableResetIterator(&iter); + hashtableRelease(ht); } } @@ -1336,7 +1334,7 @@ void sinterGenericCommand(client *c, while ((encoding = setTypeNext(si, &str, &len, &intobj)) != -1) { for (j = 1; j < setnum; j++) { if (sets[j] == sets[0]) continue; - if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HT)) break; + if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE)) break; } /* Only take action when all sets contain the member */ @@ -1355,7 +1353,7 @@ void sinterGenericCommand(client *c, } else { if (str && only_integers) { /* It may be an integer although we got it as a string. */ - if (encoding == OBJ_ENCODING_HT && string2ll(str, len, (long long *)&intobj)) { + if (encoding == OBJ_ENCODING_HASHTABLE && string2ll(str, len, (long long *)&intobj)) { if (dstset->encoding == OBJ_ENCODING_LISTPACK || dstset->encoding == OBJ_ENCODING_INTSET) { /* Adding it as an integer is more efficient. */ str = NULL; @@ -1365,7 +1363,7 @@ void sinterGenericCommand(client *c, only_integers = 0; } } - setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HT); + setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE); } } } @@ -1467,7 +1465,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke /* For a SET's encoding, according to the factory method setTypeCreate(), currently have 3 types: * 1. OBJ_ENCODING_INTSET * 2. OBJ_ENCODING_LISTPACK - * 3. OBJ_ENCODING_HT + * 3. OBJ_ENCODING_HASHTABLE * 'dstset_encoding' is used to determine which kind of encoding to use when initialize 'dstset'. * * If all sets are all OBJ_ENCODING_INTSET encoding or 'dstkey' is not null, keep 'dstset' @@ -1478,8 +1476,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke * the hashtable is more efficient when find and compare than the listpack. The corresponding * time complexity are O(1) vs O(n). */ if (!dstkey && dstset_encoding == OBJ_ENCODING_INTSET && - (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HT)) { - dstset_encoding = OBJ_ENCODING_HT; + (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HASHTABLE)) { + dstset_encoding = OBJ_ENCODING_HASHTABLE; } sets[j] = setobj; if (j > 0 && sets[0] == sets[j]) { @@ -1536,7 +1534,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } setTypeReleaseIterator(si); } @@ -1556,11 +1554,11 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke for (j = 1; j < setnum; j++) { if (!sets[j]) continue; /* no key is an empty set. */ if (sets[j] == sets[0]) break; /* same set! 
*/ - if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HT)) break; + if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HASHTABLE)) break; } if (j == setnum) { /* There is no other set with this element. Add it. */ - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); @@ -1578,9 +1576,9 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { if (j == 0) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } else { - cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); diff --git a/src/t_zset.c b/src/t_zset.c index 105d57b7c3..e8c5a369b7 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2069,9 +2069,7 @@ typedef struct { int ii; } is; struct { - dict *dict; - dictIterator *di; - dictEntry *de; + hashtableIterator *iter; } ht; struct { unsigned char *lp; @@ -2126,10 +2124,8 @@ void zuiInitIterator(zsetopsrc *op) { if (op->encoding == OBJ_ENCODING_INTSET) { it->is.is = op->subject->ptr; it->is.ii = 0; - } else if (op->encoding == OBJ_ENCODING_HT) { - it->ht.dict = op->subject->ptr; - it->ht.di = dictGetIterator(op->subject->ptr); - it->ht.de = dictNext(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + it->ht.iter = hashtableCreateIterator(op->subject->ptr); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { it->lp.lp = op->subject->ptr; it->lp.p = lpFirst(it->lp.lp); @@ -2166,8 +2162,8 @@ void zuiClearIterator(zsetopsrc *op) { iterset *it = &op->iter.set; if (op->encoding == OBJ_ENCODING_INTSET) { UNUSED(it); /* skip */ - } else if (op->encoding == OBJ_ENCODING_HT) { - dictReleaseIterator(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + hashtableReleaseIterator(it->ht.iter); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { UNUSED(it); } else { @@ -2235,13 +2231,11 @@ int zuiNext(zsetopsrc *op, zsetopval *val) { /* Move to next element. */ it->is.ii++; - } else if (op->encoding == OBJ_ENCODING_HT) { - if (it->ht.de == NULL) return 0; - val->ele = dictGetKey(it->ht.de); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(it->ht.iter, &next)) return 0; + val->ele = next; val->score = 1.0; - - /* Move to next element. */ - it->ht.de = dictNext(it->ht.di); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { if (it->lp.p == NULL) return 0; val->estr = lpGetValue(it->lp.p, &val->elen, &val->ell); diff --git a/src/zmalloc.c b/src/zmalloc.c index a696111e47..b1de4f2af1 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -451,15 +451,25 @@ void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) { zmalloc_oom_handler = oom_handler; } -/* Use 'MADV_DONTNEED' to release memory to operating system quickly. - * We do that in a fork child process to avoid CoW when the parent modifies - * these shared pages. 
*/
-void zmadvise_dontneed(void *ptr) {
+/* Try to release pages back to the OS directly using 'MADV_DONTNEED' (bypassing
+ * the allocator) in a fork child process to avoid CoW when the parent modifies
+ * those shared pages. For small allocations, we can't release any full page,
+ * so in an effort to avoid getting the size of the allocation from the
+ * allocator (malloc_size) when we already know it's small, we check the
+ * size_hint. If the size is not already known, passing a size_hint of 0 will
+ * lead to checking the real size of the allocation.
+ * Also please note that the size may not be accurate, so in order to make this
+ * solution effective, the judgement for releasing memory pages should not be
+ * too strict. */
+void zmadvise_dontneed(void *ptr, size_t size_hint) {
 #if defined(USE_JEMALLOC) && defined(__linux__)
+    if (ptr == NULL) return;
+
     static size_t page_size = 0;
     if (page_size == 0) page_size = sysconf(_SC_PAGESIZE);
     size_t page_size_mask = page_size - 1;
 
+    if (size_hint && size_hint / 2 < page_size) return;
     size_t real_size = zmalloc_size(ptr);
     if (real_size < page_size) return;
 
@@ -473,6 +483,7 @@ void zmadvise_dontneed(void *ptr) {
 }
 #else
     (void)(ptr);
+    (void)(size_hint);
 #endif
 }
 
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 38c2bae864..68b4df63aa 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -139,7 +139,7 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
 size_t zmalloc_get_memory_size(void);
 void zlibc_free(void *ptr);
 void zlibc_trim(void);
-void zmadvise_dontneed(void *ptr);
+void zmadvise_dontneed(void *ptr, size_t size_hint);
 
 #ifndef HAVE_MALLOC_SIZE
 size_t zmalloc_size(void *ptr);
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index cf7f633a8c..e50faba62b 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -515,10 +515,10 @@ start_server {tags {"info" "external:skip"}} {
         set info_mem [r info memory]
         set mem_stats [r memory stats]
         assert_equal [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] {0}
-        # overhead.db.hashtable.lut = memory overhead of hashset including hashset struct and tables
-        set hashset_overhead [dict get $mem_stats overhead.db.hashtable.lut]
-        if {$hashset_overhead < 140} {
-            # 32-bit version (hashset struct + 1 bucket of 64 bytes)
+        # overhead.db.hashtable.lut = memory overhead of hashtable including hashtable struct and tables
+        set hashtable_overhead [dict get $mem_stats overhead.db.hashtable.lut]
+        if {$hashtable_overhead < 140} {
+            # 32-bit version (hashtable struct + 1 bucket of 64 bytes)
             set bits 32
         } else {
             set bits 64
diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index 944c3d3d98..1871ec9b4d 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -33,6 +33,7 @@ start_server {
             assert_equal {0 1} [r smismember myset bla foo]
             assert_equal {0} [r smismember myset bla]
             assert_equal "bar $initelems($type)" [lsort [r smembers myset]]
+            r memory usage myset
         }
     }
 
@@ -51,6 +52,7 @@ start_server {
         assert_equal {0 1} [r smismember myset 18 16]
         assert_equal {0} [r smismember myset 18]
         assert_equal {16 17} [lsort [r smembers myset]]
+        r memory usage myset
     }
 
     test {SMISMEMBER SMEMBERS SCARD against non set} {
@@ -1029,111 +1031,6 @@ foreach type {single multiple single_multiple} {
         r srem $myset {*}$members
     }
 
-    proc verify_rehashing_completed_key {myset table_size keys} {
-        set htstats [r debug HTSTATS-KEY $myset]
-        assert {![string match {*rehashing target*} $htstats]}
-        return {[string match {*table size: $table_size*number of elements: $keys*} $htstats]}
} - - test "SRANDMEMBER with a dict containing long chain" { - set origin_save [config_get_set save ""] - set origin_max_lp [config_get_set set-max-listpack-entries 0] - set origin_save_delay [config_get_set rdb-key-save-delay 2147483647] - - # 1) Create a hash set with 100000 members. - set members {} - for {set i 0} {$i < 100000} {incr i} { - lappend members [format "m:%d" $i] - } - create_set myset $members - - # 2) Wait for the hash set rehashing to finish. - while {[is_rehashing myset]} { - r srandmember myset 100 - } - - # 3) Turn off the rehashing of this set, and remove the members to 500. - r bgsave - rem_hash_set_top_N myset [expr {[r scard myset] - 500}] - assert_equal [r scard myset] 500 - - # 4) Kill RDB child process to restart rehashing. - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 5) Let the set hash to start rehashing - r spop myset 1 - assert [is_rehashing myset] - - # 6) Verify that when rdb saving is in progress, rehashing will still be performed (because - # the ratio is extreme) by waiting for it to finish during an active bgsave. - r bgsave - - while {[is_rehashing myset]} { - r srandmember myset 1 - } - if {$::verbose} { - puts [r debug HTSTATS-KEY myset full] - } - - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 7) Check that eventually, SRANDMEMBER returns all elements. - array set allmyset {} - foreach ele [r smembers myset] { - set allmyset($ele) 1 - } - unset -nocomplain auxset - set iterations 1000 - while {$iterations != 0} { - incr iterations -1 - set res [r srandmember myset -10] - foreach ele $res { - set auxset($ele) 1 - } - if {[lsort [array names allmyset]] eq - [lsort [array names auxset]]} { - break; - } - } - assert {$iterations != 0} - - # 8) Remove the members to 30 in order to calculate the value of Chi-Square Distribution, - # otherwise we would need more iterations. - rem_hash_set_top_N myset [expr {[r scard myset] - 30}] - assert_equal [r scard myset] 30 - - # Hash set rehashing would be completed while removing members from the `myset` - # We also check the size and members in the hash table. - verify_rehashing_completed_key myset 64 30 - - # Now that we have a hash set with only one long chain bucket. - set htstats [r debug HTSTATS-KEY myset full] - assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]} - assert {[regexp {max chain length: ([0-9]+)} $htstats - max_chain_length]} - assert {$different_slots == 1 && $max_chain_length == 30} - - # 9) Use positive count (PATH 4) to get 10 elements (out of 30) each time. 
-        unset -nocomplain allkey
-        set iterations 1000
-        while {$iterations != 0} {
-            incr iterations -1
-            set res [r srandmember myset 10]
-            foreach ele $res {
-                lappend allkey $ele
-            }
-        }
-        # validate even distribution of random sampling (df = 29, 73 means 0.00001 probability)
-        assert_lessthan [chi_square_value $allkey] 73
-
-        r config set save $origin_save
-        r config set set-max-listpack-entries $origin_max_lp
-        r config set rdb-key-save-delay $origin_save_delay
-    } {OK} {needs:debug slow}
-
     proc setup_move {} {
         r del myset3{t} myset4{t}
         create_set myset1{t} {1 a b}

From aa35b89456d871c9c0292dcba07d9452540e1aad Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sun, 15 Dec 2024 12:09:53 +0800
Subject: [PATCH 015/101] Automatic failover vote is not limited by two times
 the node timeout (#1356)

This is a follow-up of #1305; we have now decided to apply the same
change to automatic failover as well, that is, to move forward with
removing it for both automatic and manual failovers.

Quote from Ping during the review:
Note that we already debounce transient primary failures with node
timeout, ensuring failover is only triggered after sustained outages.
Election timing is naturally staggered by replica spacing, making the
likelihood of simultaneous elections from replicas of the same shard
very low. The one-vote-per-epoch rule further throttles retries and
ensures orderly elections. On top of that, quorum-based primary failure
confirmation, cluster-state convergence, and slot ownership validation
are all built into the process.

Quote from Madelyn during the review:
It's against the specific primary. It's to prevent double failovers. If
a primary just took over we don't want someone else to try to take over
and give the new primary some amount of time to take over. I have not
seen this issue though, it might have been over optimizing? The double
failure mode, where a node fails and then another node fails within the
nodetimeout, also doesn't seem that common either though.

So the conclusion is that we all agreed to remove it completely; it
will make the code a lot simpler. And if there are other specific edge
cases we are missing, we will fix them in another way. See discussion
#1305 for more information.

Signed-off-by: Binbin
---
 src/cluster_legacy.c                   | 19 -------------
 src/cluster_legacy.h                   |  2 --
 tests/unit/cluster/manual-failover.tcl | 39 ++++++++++++++++++++------
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index d1c6dd0094..418070f69c 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1505,7 +1505,6 @@ clusterNode *createClusterNode(char *nodename, int flags) {
     node->cport = 0;
     node->tls_port = 0;
     node->fail_reports = listCreate();
-    node->voted_time = 0;
     node->orphaned_time = 0;
     node->repl_offset_time = 0;
     node->repl_offset = 0;
@@ -4396,23 +4395,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
         return;
     }
 
-    /* We did not voted for a replica about this primary for two
-     * times the node timeout. This is not strictly needed for correctness
-     * of the algorithm but makes the base case more linear.
-     *
-     * This limitation does not restrict manual failover. If a user initiates
-     * a manual failover, we need to allow it to vote, otherwise the manual
-     * failover may time out.
*/ - if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "can't vote for any replica of %.40s (%s) within %lld milliseconds", - node->name, node->human_nodename, - node->replicaof->name, node->replicaof->human_nodename, - (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); - return; - } - /* The replica requesting the vote must have a configEpoch for the claimed * slots that is >= the one of the primaries currently serving the same * slots in the current configuration. */ @@ -4434,7 +4416,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index fb317038d6..d3e1c3459e 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -341,8 +341,6 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual - * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 220ffc3eaf..dbcbb26380 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -189,11 +189,6 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6 set CLUSTER_PACKET_TYPE_NONE -1 - # Setting a large timeout to make sure we hit the voted_time limit. - R 0 config set cluster-node-timeout 150000 - R 1 config set cluster-node-timeout 150000 - R 2 config set cluster-node-timeout 150000 - # Let replica drop FAILOVER_AUTH_ACK so that the election won't # get the enough votes and the election will time out. R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK @@ -229,10 +224,6 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval pause_process [srv 0 pid] wait_for_cluster_state fail - # Setting a large timeout to make sure we hit the voted_time limit. - R 1 config set cluster-node-timeout 150000 - R 2 config set cluster-node-timeout 150000 - # R 3 performs an automatic failover and it will work. 
        R 3 config set cluster-replica-no-failover no
 
         wait_for_condition 1000 50 {
@@ -272,6 +263,36 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval
     }
 } ;# start_cluster
 
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Automatic failover vote is not limited by two times the node timeout - mixed failover" {
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # Let R 3 trigger the automatic failover
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The third failover does not happen"
+        }
+    }
+} ;# start_cluster
+
 start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
     test "Manual failover will reset the on-going election" {
         set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5

From 4b7e07f84be96dc1a85b14b7a014b5a7e253752f Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 16 Dec 2024 13:43:48 +0800
Subject: [PATCH 016/101] Drop the MEET packet if the link node is in handshake
 state (#1436)

After #1307 got merged, we noticed an assert happening in
setClusterNodeToInboundClusterLink:
```
=== ASSERTION FAILED ===
==> '!link->node' is not true
```
In #778, we call setClusterNodeToInboundClusterLink to attach the node
to the link during MEET processing, so if we receive another MEET
packet in a short time while the node is still in the handshake state,
we hit this assert and crash the server.

If the link is bound to a node, the node is in the handshake state, and
we receive a MEET packet, it may be that the sender sent multiple MEET
packets, so here we drop the MEET to avoid the assert in
setClusterNodeToInboundClusterLink. The assert fires when the other
node sends a MEET packet because it detects that there is no inbound
link: this node creates a new node in HANDSHAKE state (with a random
node name) and responds with a PONG. The other node receives the PONG
and removes the CLUSTER_NODE_MEET flag. This node is supposed to open
an outbound connection to the other node in the next cron cycle, but
before this happens, the other node re-sends a MEET on the same link
because it still detects no inbound connection.

Note that in getNodeFromLinkAndMsg, the node in the handshake state has
a random name and is not truly "known", so we don't know the sender.
Dropping the MEET packet prevents us from creating a random node,
avoids incorrect link binding, and avoids having a duplicate MEET
packet eliminate the handshake state.
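For reference, a rough sketch of the assert site this guard protects,
simplified from setClusterNodeToInboundClusterLink in cluster_legacy.c
(the exact body may differ; the inbound_link bookkeeping is an
assumption here):

```c
/* Sketch only: bind an inbound link to a node. Processing a second MEET on a
 * link that already has a handshake node bound would re-enter this function
 * with link->node != NULL and trip the first assert, i.e. the crash quoted
 * above. */
static void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) {
    serverAssert(!link->node);         /* '!link->node' is not true */
    serverAssert(!node->inbound_link); /* assumed companion invariant */
    link->node = node;
    node->inbound_link = link;
}
```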
Signed-off-by: Binbin
---
 src/cluster_legacy.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 418070f69c..9ddcf6678d 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3003,7 +3003,8 @@ int clusterIsValidPacket(clusterLink *link) {
     }
 
     if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) {
-        serverLog(LL_WARNING, "Dropping packet that matches debug drop filter");
+        serverLog(LL_WARNING, "Dropping packet of type %s that matches debug drop filter",
+                  clusterGetMessageTypeString(type));
         return 0;
     }
 
@@ -3094,7 +3095,7 @@ int clusterProcessPacket(clusterLink *link) {
     if (server.debug_cluster_close_link_on_packet_drop &&
         (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) {
         freeClusterLink(link);
-        serverLog(LL_WARNING, "Closing link for matching packet type %hu", type);
+        serverLog(LL_WARNING, "Closing link for matching packet type %s", clusterGetMessageTypeString(type));
        return 0;
     }
     return 1;
@@ -3110,8 +3111,8 @@ int clusterProcessPacket(clusterLink *link) {
         freeClusterLink(link);
         serverLog(
             LL_NOTICE,
-            "Closing link for node that sent a lightweight message of type %hu as its first message on the link",
-            type);
+            "Closing link for node that sent a lightweight message of type %s as its first message on the link",
+            clusterGetMessageTypeString(type));
         return 0;
     }
     clusterNode *sender = link->node;
@@ -3120,6 +3121,27 @@ int clusterProcessPacket(clusterLink *link) {
         return 1;
     }
 
+    if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) {
+        /* If the link is bound to a node and the node is in the handshake state, and we receive
+         * a MEET packet, it may be that the sender sent multiple MEET packets, so here we drop
+         * the MEET to avoid the assert in setClusterNodeToInboundClusterLink. The assert fires
+         * when the other node sends a MEET packet because it detects that there is no inbound
+         * link: this node creates a new node in HANDSHAKE state (with a random node name), and
+         * responds with a PONG. The other node receives the PONG and removes the CLUSTER_NODE_MEET
+         * flag. This node is supposed to open an outbound connection to the other node in the next
+         * cron cycle, but before this happens, the other node re-sends a MEET on the same link
+         * because it still detects no inbound connection. We improved the re-send logic of MEET in
+         * #1441; now we only re-send the MEET packet once every handshake timeout period.
+         *
+         * Note that in getNodeFromLinkAndMsg, the node in the handshake state has a random name
+         * and is not truly "known", so we don't know the sender. Dropping the MEET packet prevents
+         * us from creating a random node, avoids incorrect link binding, and avoids having a
+         * duplicate MEET packet eliminate the handshake state. */
+        serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state",
+                  link->node->name);
+        return 1;
+    }
+
     uint16_t flags = ntohs(hdr->flags);
     uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0;
     clusterNode *sender = getNodeFromLinkAndMsg(link, hdr);

From d76ca02c3a019ccc994a7316605c0def46d75721 Mon Sep 17 00:00:00 2001
From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:01:34 -0800
Subject: [PATCH 017/101] Fix the secret for the test bucket.
 (#1447)

We have set the secret as `AWS_S3_TEST_BUCKET` for the test bucket and
I missed it in the initial review.

Signed-off-by: Roshan Khatri
---
 .github/workflows/build-release-packages.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml
index 6c54971bcd..3f1ca2627b 100644
--- a/.github/workflows/build-release-packages.yml
+++ b/.github/workflows/build-release-packages.yml
@@ -59,8 +59,10 @@ jobs:
         id: check-if-testing
         run: |
           if [[ "${{ github.event_name }}" == "push" ]]; then
+            echo "This is a test workflow -> We will upload to the Test S3 Bucket"
             echo "IS_TEST=true" >> $GITHUB_OUTPUT
           else
+            echo "This is a Release workflow -> We will upload to the Release S3 Bucket"
             echo "IS_TEST=false" >> $GITHUB_OUTPUT
           fi
         shell: bash
@@ -92,7 +94,7 @@ jobs:
       build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }}
       region: us-west-2
     secrets:
-      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
+      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
       role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}

   release-build-linux-arm-packages:
@@ -106,5 +108,5 @@ jobs:
       build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }}
       region: us-west-2
     secrets:
-      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
+      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
       role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}

From bc4f865acac3579fccac5d97128eee90c17d3be8 Mon Sep 17 00:00:00 2001
From: xbasel <103044017+xbasel@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:04:27 +0200
Subject: [PATCH 018/101] Fix test_reclaimFilePageCache to avoid tmpfs (#1379)

Avoid tmpfs as fadvise(FADV_DONTNEED) has no effect on memory-backed
filesystems.
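As a standalone illustration of why the test must skip tmpfs (a
Linux-only sketch, not part of the patch; the file path and messages
are made up):

```c
/* On tmpfs the page cache *is* the file's backing store, so
 * posix_fadvise(POSIX_FADV_DONTNEED) cannot reclaim those pages, and an
 * assertion that cache was reclaimed can never hold there. */
#include <stdio.h>
#include <sys/vfs.h>
#include <linux/magic.h>

int main(void) {
    struct statfs st;
    if (statfs("/tmp", &st) != 0) {
        perror("statfs");
        return 1;
    }
    if (st.f_type == TMPFS_MAGIC)
        puts("/tmp is tmpfs: FADV_DONTNEED is a no-op, skip the reclaim test");
    else
        puts("/tmp is disk-backed: the reclaim test is meaningful");
    return 0;
}
```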
Fixes https://github.com/valkey-io/valkey/issues/897

---------

Signed-off-by: Ran Shidlansik
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Ran Shidlansik
---
 src/unit/test_util.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/unit/test_util.c b/src/unit/test_util.c
index 4558c38c3b..9858318e06 100644
--- a/src/unit/test_util.c
+++ b/src/unit/test_util.c
@@ -6,6 +6,11 @@
 #include "../util.h"
 #include "test_help.h"
 
+#if defined(__linux__)
+#include <sys/vfs.h>
+#include <linux/magic.h>
+#endif
+
 int test_string2ll(int argc, char **argv, int flags) {
     UNUSED(argc);
     UNUSED(argv);
@@ -291,6 +296,15 @@ int test_reclaimFilePageCache(int argc, char **argv, int flags) {
     if (flags & UNIT_TEST_VALGRIND) return 0;
 
 #if defined(__linux__)
+    struct statfs stats;
+
+    /* Check if /tmp is memory-backed (e.g., tmpfs) */
+    if (statfs("/tmp", &stats) == 0) {
+        if (stats.f_type == TMPFS_MAGIC) { // tmpfs: FADV_DONTNEED has no effect, skip
+            return 0;
+        }
+    }
+
     char *tmpfile = "/tmp/redis-reclaim-cache-test";
     int fd = open(tmpfile, O_RDWR | O_CREAT, 0644);
     TEST_ASSERT(fd >= 0);

From 5c414bf784d1319e7d4f2705a77cc582642db36a Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:07:55 +0200
Subject: [PATCH 019/101] Introduce FORCE_DEFRAG compilation option to allow
 activedefrag run when allocator is not jemalloc (#1303)

Introduce a compile-time option to force activedefrag to run even when
jemalloc is not used as the allocator. This is in order to be able to
run tests with defrag enabled while using memory instrumentation tools.

fixes: https://github.com/valkey-io/valkey/issues/1241

---------

Signed-off-by: ranshid
Signed-off-by: Ran Shidlansik
Signed-off-by: Madelyn Olson
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Madelyn Olson
---
 .github/workflows/daily.yml | 46 +++++++++++++++++++++++++++++
 CMakeLists.txt              |  1 +
 deps/CMakeLists.txt         |  4 ++-
 src/CMakeLists.txt          |  6 ++++
 src/Makefile                |  5 ++++
 src/allocator_defrag.c      | 59 ++++++++++++++++++++++++++++++++++---
 src/allocator_defrag.h      | 10 ++++---
 src/config.c                |  2 +-
 src/defrag.c                | 28 ------------------
 src/server.h                |  5 ++++
 tests/support/server.tcl    |  5 ++++
 tests/test_helper.tcl       |  4 +++
 tests/unit/info.tcl         |  2 +-
 13 files changed, 138 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
index c06d73440d..44386f5ffd 100644
--- a/.github/workflows/daily.yml
+++ b/.github/workflows/daily.yml
@@ -689,6 +689,52 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: ./src/valkey-unit-tests --accurate
 
+  test-sanitizer-force-defrag:
+    runs-on: ubuntu-latest
+    if: |
+      (github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') ||
+      (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) &&
+      !contains(github.event.inputs.skipjobs, 'sanitizer')
+    timeout-minutes: 14400
+    strategy:
+      fail-fast: false
+    steps:
+      - name: prep
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV
+          echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV
+          echo "skipjobs: ${{github.event.inputs.skipjobs}}"
+          echo "skiptests: ${{github.event.inputs.skiptests}}"
+          echo "test_args: ${{github.event.inputs.test_args}}"
+          echo "cluster_test_args: 
${{github.event.inputs.cluster_test_args}}"
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          repository: ${{ env.GITHUB_REPOSITORY }}
+          ref: ${{ env.GITHUB_HEAD_REF }}
+      - name: make
+        run: make all-with-unit-tests OPT=-O3 SANITIZER=address DEBUG_FORCE_DEFRAG=yes USE_JEMALLOC=no SERVER_CFLAGS='-Werror'
+      - name: testprep
+        run: |
+          sudo apt-get update
+          sudo apt-get install tcl8.6 tclx -y
+      - name: test
+        if: true && !contains(github.event.inputs.skiptests, 'valkey')
+        run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}}
+      - name: module api test
+        if: true && !contains(github.event.inputs.skiptests, 'modules')
+        run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}}
+      - name: sentinel tests
+        if: true && !contains(github.event.inputs.skiptests, 'sentinel')
+        run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
+      - name: cluster tests
+        if: true && !contains(github.event.inputs.skiptests, 'cluster')
+        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
+      - name: unittest
+        if: true && !contains(github.event.inputs.skiptests, 'unittest')
+        run: ./src/valkey-unit-tests
+
   test-rpm-distros-jemalloc:
     if: |
       (github.event_name == 'workflow_dispatch' ||

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77d0c4e7d8..55b18cb994 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,3 +41,4 @@ unset(BUILD_UNIT_TESTS CACHE)
 unset(BUILD_TEST_MODULES CACHE)
 unset(BUILD_EXAMPLE_MODULES CACHE)
 unset(USE_TLS CACHE)
+unset(DEBUG_FORCE_DEFRAG CACHE)

diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt
index c904b94031..3f5b04dc22 100644
--- a/deps/CMakeLists.txt
+++ b/deps/CMakeLists.txt
@@ -1,4 +1,6 @@
-add_subdirectory(jemalloc)
+if (USE_JEMALLOC)
+    add_subdirectory(jemalloc)
+endif ()
 add_subdirectory(lua)
 
 # Set hiredis options. We need to disable the defaults set in the OPTION(..) we do this by setting them in the CACHE

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b87dff3db0..90d7e25cf4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -22,6 +22,12 @@ if (VALKEY_RELEASE_BUILD)
     set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif ()
 
+if (DEBUG_FORCE_DEFRAG)
+    message(STATUS "Forcing Active Defrag run on valkey-server")
+    target_compile_definitions(valkey-server PRIVATE DEBUG_FORCE_DEFRAG)
+    target_compile_definitions(valkey-server PRIVATE HAVE_DEFRAG)
+endif ()
+
 if (BUILD_SANITIZER)
     # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input)
     # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS'

diff --git a/src/Makefile b/src/Makefile
index 8552deb3d9..e52f4f08d3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -130,6 +130,11 @@ ifdef REDIS_LDFLAGS
     SERVER_LDFLAGS := $(REDIS_LDFLAGS)
 endif
 
+# Special case of forcing defrag to run even though we have no Jemalloc support
+ifeq ($(DEBUG_FORCE_DEFRAG), yes)
+    SERVER_CFLAGS +=-DHAVE_DEFRAG -DDEBUG_FORCE_DEFRAG
+endif
+
 FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
 FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
 FINAL_LIBS=-lm

diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c
index b2330c95e0..5e805b3044 100644
--- a/src/allocator_defrag.c
+++ b/src/allocator_defrag.c
@@ -43,12 +43,10 @@
 * the other component to ensure both are using the same allocator configuration.
*/
-#include
+#include "server.h"
 #include "serverassert.h"
 #include "allocator_defrag.h"
 
-#define UNUSED(x) (void)(x)
-
 #if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC)
 
 #define STRINGIFY_(x) #x
@@ -402,8 +400,56 @@ int allocatorShouldDefrag(void *ptr) {
                                        je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0));
 }
 
-#else
+/* Utility function to get the fragmentation ratio from jemalloc.
+ * It is critical to do that by comparing only heap maps that belong to
+ * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this
+ * fragmentation ratio in order to decide if a defrag action should be taken
+ * or not, a false detection can cause the defragmenter to waste a lot of CPU
+ * without the possibility of getting any results. */
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    size_t resident, active, allocated, frag_smallbins_bytes;
+    zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL);
+    frag_smallbins_bytes = allocatorDefragGetFragSmallbins();
+    /* Calculate the fragmentation ratio as the proportion of wasted memory in small
+     * bins (which are defraggable) relative to the total allocated memory (including large bins).
+     * This is because otherwise, if most of the memory usage is large bins, we may show high percentage,
+     * despite the fact it's not a lot of memory for the user. */
+    float frag_pct = (float)frag_smallbins_bytes / allocated * 100;
+    float rss_pct = ((float)resident / allocated) * 100 - 100;
+    size_t rss_bytes = resident - allocated;
+    if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes;
+    serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)",
+              allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes);
+    return frag_pct;
+}
+
+#elif defined(DEBUG_FORCE_DEFRAG)
+int allocatorDefragInit(void) {
+    return 0;
+}
+void allocatorDefragFree(void *ptr, size_t size) {
+    UNUSED(size);
+    zfree(ptr);
+}
+__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) {
+    return zmalloc(size);
+}
+unsigned long allocatorDefragGetFragSmallbins(void) {
+    return 0;
+}
+
+int allocatorShouldDefrag(void *ptr) {
+    UNUSED(ptr);
+    return 1;
+}
+
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    *out_frag_bytes = server.active_defrag_ignore_bytes + 1;
+    return server.active_defrag_threshold_upper;
+}
+
+#else
 int allocatorDefragInit(void) {
     return -1;
 }
@@ -423,4 +469,9 @@ int allocatorShouldDefrag(void *ptr) {
     UNUSED(ptr);
     return 0;
 }
+
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    UNUSED(out_frag_bytes);
+    return 0;
+}
 #endif

diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h
index 7fb56208b6..7947bef72c 100644
--- a/src/allocator_defrag.h
+++ b/src/allocator_defrag.h
@@ -5,10 +5,11 @@
 #include <jemalloc/jemalloc.h>
 /* We can enable the server defrag capabilities only if we are using Jemalloc
  * and the version that has the experimental.utilization namespace in mallctl .
*/ -#if defined(JEMALLOC_VERSION_MAJOR) && \ - (JEMALLOC_VERSION_MAJOR > 5 || \ - (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ - (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1)) +#if (defined(JEMALLOC_VERSION_MAJOR) && \ + (JEMALLOC_VERSION_MAJOR > 5 || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1))) || \ + defined(DEBUG_FORCE_DEFRAG) #define HAVE_DEFRAG #endif #endif @@ -18,5 +19,6 @@ void allocatorDefragFree(void *ptr, size_t size); __attribute__((malloc)) void *allocatorDefragAlloc(size_t size); unsigned long allocatorDefragGetFragSmallbins(void); int allocatorShouldDefrag(void *ptr); +float getAllocatorFragmentation(size_t *out_frag_bytes); #endif /* __ALLOCATOR_DEFRAG_H */ diff --git a/src/config.c b/src/config.c index bcfa465e1f..f08b79ebbd 100644 --- a/src/config.c +++ b/src/config.c @@ -3186,7 +3186,7 @@ standardConfig static_configs[] = { createBoolConfig("replica-read-only", "slave-read-only", DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_replica_ro, 1, NULL, NULL), createBoolConfig("replica-ignore-maxmemory", "slave-ignore-maxmemory", MODIFIABLE_CONFIG, server.repl_replica_ignore_maxmemory, 1, NULL, NULL), createBoolConfig("jemalloc-bg-thread", NULL, MODIFIABLE_CONFIG, server.jemalloc_bg_thread, 1, NULL, updateJemallocBgThread), - createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, 0, isValidActiveDefrag, NULL), + createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, CONFIG_ACTIVE_DEFRAG_DEFAULT, isValidActiveDefrag, NULL), createBoolConfig("syslog-enabled", NULL, IMMUTABLE_CONFIG, server.syslog_enabled, 0, NULL, NULL), createBoolConfig("cluster-enabled", NULL, IMMUTABLE_CONFIG, server.cluster_enabled, 0, NULL, NULL), createBoolConfig("appendonly", NULL, MODIFIABLE_CONFIG | DENY_LOADING_CONFIG, server.aof_enabled, 0, NULL, updateAppendonly), diff --git a/src/defrag.c b/src/defrag.c index 8e7fc8449e..6522d9aa7b 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -149,11 +149,6 @@ static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper static list *defrag_later; static unsigned long defrag_later_cursor; - -/* this method was added to jemalloc in order to help us understand which - * pointers are worthwhile moving and which aren't */ -int je_get_defrag_hint(void *ptr); - /* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. * It is the responsibility of the caller to free the old block if a non-NULL value (new block) * is returned. (Returns NULL if no relocation was needed.) @@ -824,29 +819,6 @@ static void dbKeysScanCallback(void *privdata, void *elemref) { server.stat_active_defrag_scanned++; } -/* Utility function to get the fragmentation ratio from jemalloc. - * It is critical to do that by comparing only heap maps that belong to - * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this - * fragmentation ratio in order to decide if a defrag action should be taken - * or not, a false detection can cause the defragmenter to waste a lot of CPU - * without the possibility of getting any results. 
*/ -static float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); - frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); - /* Calculate the fragmentation ratio as the proportion of wasted memory in small - * bins (which are defraggable) relative to the total allocated memory (including large bins). - * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, - * despite the fact it's not a lot of memory for the user. */ - float frag_pct = (float)frag_smallbins_bytes / allocated * 100; - float rss_pct = ((float)resident / allocated) * 100 - 100; - size_t rss_bytes = resident - allocated; - if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; - serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); - return frag_pct; -} - /* Defrag scan callback for a pubsub channels hashtable. */ static void defragPubsubScanCallback(void *privdata, void *elemref) { defragPubSubCtx *ctx = privdata; diff --git a/src/server.h b/src/server.h index b07144de92..eb19d18c8d 100644 --- a/src/server.h +++ b/src/server.h @@ -148,6 +148,11 @@ struct hdr_histogram; #define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \ * channel to establish psync. */ #define LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT 100 /* Default: 0.1 seconds */ +#if !defined(DEBUG_FORCE_DEFRAG) +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 0 +#else +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 +#endif /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. */ diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 7257339042..8c545d900a 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -221,6 +221,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::debug_defrag && [lsearch $tags "debug_defrag:skip"] >= 0} { + set err "Not supported on server compiled with DEBUG_FORCE_DEFRAG option" + return 0 + } + if {$::singledb && [lsearch $tags "singledb:skip"] >= 0} { set err "Not supported on singledb" return 0 diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 1f0658071a..8a4125e48d 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -92,6 +92,7 @@ set ::large_memory 0 set ::log_req_res 0 set ::force_resp3 0 set ::solo_tests_count 0 +set ::debug_defrag 0 # Set to 1 when we are running in client mode. The server test uses a # server-client model to run tests simultaneously. The server instance @@ -607,6 +608,7 @@ proc print_help_screen {} { "--ignore-encoding Don't validate object encoding." "--ignore-digest Don't use debug digest validations." "--large-memory Run tests using over 100mb." + "--debug-defrag Indicate the test is running against server compiled with DEBUG_FORCE_DEFRAG option" "--help Print this help screen." 
} "\n"] } @@ -748,6 +750,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} { set ::ignoreencoding 1 } elseif {$opt eq {--ignore-digest}} { set ::ignoredigest 1 + } elseif {$opt eq {--debug-defrag}} { + set ::debug_defrag 1 } elseif {$opt eq {--help}} { print_help_screen exit 0 diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index e50faba62b..a27043fa88 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -10,7 +10,7 @@ proc latency_percentiles_usec {cmd} { return [latencyrstat_percentiles $cmd r] } -start_server {tags {"info" "external:skip"}} { +start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { start_server {} { test {latencystats: disable/enable} { From 1e20853898d05d7caef842bfbe84223c4e54f267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 18 Dec 2024 02:45:43 +0100 Subject: [PATCH 020/101] Discover and fix new typos (#1446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upgrade `typos` and fix corresponding typos --------- Signed-off-by: Viktor Szépe --- .github/workflows/spell-check.yml | 2 +- src/geohash_helper.c | 2 +- src/server.c | 4 +- src/server.h | 2 +- src/zmalloc.c | 2 +- tests/integration/aof-multi-part.tcl | 84 +++++++++---------- tests/integration/aof.tcl | 4 +- .../integration/dual-channel-replication.tcl | 12 +-- tests/support/aofmanifest.tcl | 4 +- tests/support/test.tcl | 4 +- 10 files changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml index 69d9b9cb6a..14db670b24 100644 --- a/.github/workflows/spell-check.yml +++ b/.github/workflows/spell-check.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Install typos - uses: taiki-e/install-action@cd5df4de2e75f3b819ba55f780f7bb8cd4a05a41 # v2.32.2 + uses: taiki-e/install-action@fe9759bf4432218c779595708e80a1aadc85cedc # v2.46.10 with: tool: typos diff --git a/src/geohash_helper.c b/src/geohash_helper.c index aa4b4743a6..c05c2f2634 100644 --- a/src/geohash_helper.c +++ b/src/geohash_helper.c @@ -48,7 +48,7 @@ /// @brief The usual PI/180 constant const double DEG_TO_RAD = 0.017453292519943295769236907684886; -/// @brief Earth's quatratic mean radius for WGS-84 +/// @brief Earth's quadratic mean radius for WGS-84 const double EARTH_RADIUS_IN_METERS = 6372797.560856; const double MERCATOR_MAX = 20037726.37; diff --git a/src/server.c b/src/server.c index da06884eb1..db39970632 100644 --- a/src/server.c +++ b/src/server.c @@ -1702,7 +1702,7 @@ static void sendGetackToReplicas(void) { robj *argv[3]; argv[0] = shared.replconf; argv[1] = shared.getack; - argv[2] = shared.special_asterick; /* Not used argument. */ + argv[2] = shared.special_asterisk; /* Not used argument. 
*/ replicationFeedReplicas(-1, argv, 3); } @@ -2088,7 +2088,7 @@ void createSharedObjects(void) { shared.load = createStringObject("LOAD", 4); shared.createconsumer = createStringObject("CREATECONSUMER", 14); shared.getack = createStringObject("GETACK", 6); - shared.special_asterick = createStringObject("*", 1); + shared.special_asterisk = createStringObject("*", 1); shared.special_equals = createStringObject("=", 1); shared.redacted = makeObjectShared(createStringObject("(redacted)", 10)); diff --git a/src/server.h b/src/server.h index eb19d18c8d..b0e4ae1050 100644 --- a/src/server.h +++ b/src/server.h @@ -1444,7 +1444,7 @@ struct sharedObjectsStruct { *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax, *emptyscan, *multi, *exec, *left, *right, *hset, *srem, *xgroup, *xclaim, *script, *replconf, *eval, *persist, *set, *pexpireat, *pexpire, *time, *pxat, *absttl, *retrycount, *force, *justid, *entriesread, *lastid, *ping, *setid, *keepttl, *load, *createconsumer, *getack, - *special_asterick, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, + *special_asterisk, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, *smessagebulk, *select[PROTO_SHARED_SELECT_CMDS], *integers[OBJ_SHARED_INTEGERS], *mbulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "*\r\n" */ *bulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "$\r\n" */ diff --git a/src/zmalloc.c b/src/zmalloc.c index b1de4f2af1..3abf9a31a0 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -762,7 +762,7 @@ void zlibc_trim(void) { /* For proc_pidinfo() used later in zmalloc_get_smap_bytes_by_field(). * Note that this file cannot be included in zmalloc.h because it includes * a Darwin queue.h file where there is a "LIST_HEAD" macro (!) defined - * conficting with user code. */ + * conflicting with user code. 
*/ #include #endif diff --git a/tests/integration/aof-multi-part.tcl b/tests/integration/aof-multi-part.tcl index 5c4f24b7d4..9a23031c08 100644 --- a/tests/integration/aof-multi-part.tcl +++ b/tests/integration/aof-multi-part.tcl @@ -4,11 +4,11 @@ set server_path [tmpdir server.multi.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_sufix$::aof_format_suffix" -set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" -set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_sufix$::aof_format_suffix" -set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_sufix$::aof_format_suffix" +set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_suffix$::aof_format_suffix" +set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" +set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_suffix$::aof_format_suffix" +set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/${aof_basename}$::manifest_suffix" set aof_old_name_old_path "$server_path/$aof_basename" set aof_old_name_new_path "$aof_dirpath/$aof_basename" @@ -705,7 +705,7 @@ tags {"external:skip"} { set client [valkey [srv host] [srv port] 0 $::tls] wait_done_loading $client - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.rdb seq 1 type b} @@ -728,7 +728,7 @@ tags {"external:skip"} { set client [valkey [srv host] [srv port] 0 $::tls] wait_done_loading $client - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.aof seq 1 type b} @@ -750,7 +750,7 @@ tags {"external:skip"} { start_server_aof [list dir $server_path aof-use-rdb-preamble no] { wait_done_loading r - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.aof seq 1 type b} @@ -827,8 +827,8 @@ tags {"external:skip"} { # Check we really have these files assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] r 
bgrewriteaof waitForBgrewriteaof r @@ -842,13 +842,13 @@ tags {"external:skip"} { assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -901,11 +901,11 @@ tags {"external:skip"} { {file appendonly.aof.5.incr.aof seq 5 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -936,17 +936,17 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] == 0 
} else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] set d1 [r debug digest] r debug loadaof @@ -965,10 +965,10 @@ tags {"external:skip"} { {file appendonly.aof.4.base.rdb seq 4 type b} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -990,13 +990,13 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] } test "AOF enable/disable auto gc" { @@ -1018,10 +1018,10 @@ tags {"external:skip"} { {file appendonly.aof.3.incr.aof seq 3 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] r config set aof-disable-auto-gc no @@ -1033,10 +1033,10 @@ tags {"external:skip"} { # wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath 
"${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -1192,7 +1192,7 @@ tags {"external:skip"} { waitForBgrewriteaof r # Can create New INCR AOF - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.11.base.rdb seq 11 type b} @@ -1248,7 +1248,7 @@ tags {"external:skip"} { # Make sure manifest file is not created assert_equal 0 [check_file_exist $aof_dirpath $aof_manifest_name] # Make sure BASE AOF is not created - assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] # Make sure the next AOFRW has started wait_for_condition 1000 50 { diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 33c7c12d4b..3a666bbd15 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -4,8 +4,8 @@ set server_path [tmpdir server.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" +set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/$aof_basename$::manifest_suffix" tags {"aof external:skip"} { diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index e417dad6c9..8191b9f699 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -355,8 +355,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key1" - wait_for_value_to_propegate_to_replica $primary $replica2 "key1" + wait_for_value_to_propagate_to_replica $primary $replica1 "key1" + wait_for_value_to_propagate_to_replica $primary $replica2 "key1" assert {[s 0 total_forks] eq "1" } } @@ -374,8 +374,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica2 replicaof $primary_host $primary_port verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key2" - wait_for_value_to_propegate_to_replica $primary $replica2 "key2" + wait_for_value_to_propagate_to_replica $primary $replica1 "key2" + wait_for_value_to_propagate_to_replica $primary $replica2 "key2" 
        wait_for_condition 50 1000 {
            [status $replica1 master_link_status] == "up"
        } else {
            fail "Replica is not synced"
        }
-        wait_for_value_to_propegate_to_replica $primary $replica1 "key3"
+        wait_for_value_to_propagate_to_replica $primary $replica1 "key3"
 
         # Verify that we did not use dual-channel-replication sync
         assert {[status $primary sync_partial_ok] == $cur_psync}
@@ -483,7 +483,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
         } else {
             fail "Replica is not synced"
         }
-        wait_for_value_to_propegate_to_replica $primary $replica "key1"
+        wait_for_value_to_propagate_to_replica $primary $replica "key1"
         # Confirm the occurrence of a race condition.
         wait_for_log_messages -1 {"* Psync established after rdb load*"} 0 2000 1
     }
diff --git a/tests/support/aofmanifest.tcl b/tests/support/aofmanifest.tcl
index 308d1172aa..fc20bacc99 100644
--- a/tests/support/aofmanifest.tcl
+++ b/tests/support/aofmanifest.tcl
@@ -1,5 +1,5 @@
-set ::base_aof_sufix ".base"
-set ::incr_aof_sufix ".incr"
+set ::base_aof_suffix ".base"
+set ::incr_aof_suffix ".incr"
 set ::manifest_suffix ".manifest"
 set ::aof_format_suffix ".aof"
 set ::rdb_format_suffix ".rdb"
diff --git a/tests/support/test.tcl b/tests/support/test.tcl
index 262dc66041..3fd74d0387 100644
--- a/tests/support/test.tcl
+++ b/tests/support/test.tcl
@@ -160,12 +160,12 @@ proc verify_replica_online {master replica_idx max_retry} {
     }
 }
 
-proc wait_for_value_to_propegate_to_replica {master replica key} {
+proc wait_for_value_to_propagate_to_replica {master replica key} {
     set val [$master get $key]
     wait_for_condition 50 500 {
         ([$replica get $key] eq $val)
     } else {
-        error "Key $key did not propegate. Expected $val but got [$replica get $key]"
+        error "Key $key did not propagate. Expected $val but got [$replica get $key]"
     }
 }
 
From cbe08dd0f29a5d27027a9b21695c83e5e1cc9972 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Tue, 17 Dec 2024 17:48:53 -0800
Subject: [PATCH 021/101] Fix undefined behavior defined by ASAN (#1451)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASan now supports making sure you are passing in the correct pointer
type, which seems useful, but we can't support it since we pass in an
incorrect pointer in several places. This is most commonly done with
generic free functions, where we simply cast it to the correct type.
It's not a lot of code to clean up, so it seems appropriate to clean it
up instead of disabling the check.

---------

Signed-off-by: Madelyn Olson
Co-authored-by: Viktor Söderqvist
---
 src/acl.c                | 20 ++++++++++----------
 src/adlist.c             |  6 ++++++
 src/adlist.h             |  1 +
 src/call_reply.c         |  2 +-
 src/db.c                 |  2 +-
 src/defrag.c             |  2 +-
 src/eval.c               |  4 ++--
 src/functions.c          |  2 +-
 src/listpack.c           |  6 ++++++
 src/listpack.h           |  1 +
 src/module.c             |  2 +-
 src/networking.c         |  2 +-
 src/replication.c        |  2 +-
 src/t_stream.c           | 19 +++++++++++++++----
 src/unit/test_listpack.c |  2 +-
 src/unit/test_ziplist.c  |  2 +-
 16 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/src/acl.c b/src/acl.c
index cfcf102887..d1f970a805 100644
--- a/src/acl.c
+++ b/src/acl.c
@@ -297,11 +297,6 @@ int ACLListMatchSds(void *a, void *b) {
     return sdscmp(a, b) == 0;
 }
 
-/* Method to free list elements from ACL users password/patterns lists. */
-void ACLListFreeSds(void *item) {
-    sdsfree(item);
-}
-
 /* Method to duplicate list elements from ACL users password/patterns lists.
*/ void *ACLListDupSds(void *item) { return sdsdup(item); @@ -374,7 +369,7 @@ aclSelector *ACLCreateSelector(int flags) { listSetFreeMethod(selector->patterns, ACLListFreeKeyPattern); listSetDupMethod(selector->patterns, ACLListDupKeyPattern); listSetMatchMethod(selector->channels, ACLListMatchSds); - listSetFreeMethod(selector->channels, ACLListFreeSds); + listSetFreeMethod(selector->channels, sdsfreeVoid); listSetDupMethod(selector->channels, ACLListDupSds); memset(selector->allowed_commands, 0, sizeof(selector->allowed_commands)); @@ -445,7 +440,7 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->passwords = listCreate(); u->acl_string = NULL; listSetMatchMethod(u->passwords, ACLListMatchSds); - listSetFreeMethod(u->passwords, ACLListFreeSds); + listSetFreeMethod(u->passwords, sdsfreeVoid); listSetDupMethod(u->passwords, ACLListDupSds); u->selectors = listCreate(); @@ -489,6 +484,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Used for generic free functions. */ +static void ACLFreeUserVoid(void *u) { + ACLFreeUser(u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user. */ @@ -2445,12 +2445,12 @@ sds ACLLoadFromFile(const char *filename) { c->user = new_user; } - if (user_channels) raxFreeWithCallback(user_channels, (void (*)(void *))listRelease); - raxFreeWithCallback(old_users, (void (*)(void *))ACLFreeUser); + if (user_channels) raxFreeWithCallback(user_channels, listReleaseVoid); + raxFreeWithCallback(old_users, ACLFreeUserVoid); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users, (void (*)(void *))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserVoid); Users = old_users; errors = sdscat(errors, "WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); diff --git a/src/adlist.c b/src/adlist.c index 11b152592b..0dc77cc038 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -77,6 +77,12 @@ void listRelease(list *list) { zfree(list); } +/* Just like listRelease, but takes the list as a (void *). + * Useful as generic free callback. */ +void listReleaseVoid(void *l) { + listRelease((list *)l); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. * diff --git a/src/adlist.h b/src/adlist.h index bfc4280434..c642c1c791 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -72,6 +72,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseVoid(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/call_reply.c b/src/call_reply.c index 00d196081e..dc981b8be8 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -559,7 +559,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void *))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreeVoid); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/db.c b/src/db.c index 1223d00c8d..e31d7e7f7f 100644 --- a/src/db.c +++ b/src/db.c @@ -1193,7 +1193,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * are deep copied temporary strings. 
We must not free them if they are just * a shallow copy - a pointer to the actual data in the data structure */ if (!shallow_copied_list_items) { - listSetFreeMethod(keys, (void (*)(void *))sdsfree); + listSetFreeMethod(keys, sdsfreeVoid); } /* For main hash table scan or scannable data structure. */ diff --git a/src/defrag.c b/src/defrag.c index 6522d9aa7b..e9f40d4fab 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -421,7 +421,7 @@ static void activeDefragQuickListNodes(quicklist *ql) { static void defragLater(robj *obj) { if (!defrag_later) { defrag_later = listCreate(); - listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + listSetFreeMethod(defrag_later, sdsfreeVoid); defrag_later_cursor = 0; } sds key = sdsdup(objectGetKey(obj)); diff --git a/src/eval.c b/src/eval.c index a9c50cdf90..e9fac531f5 100644 --- a/src/eval.c +++ b/src/eval.c @@ -204,7 +204,7 @@ void scriptingInit(int setup) { * and we need to free them respectively. */ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); - listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); + listSetFreeMethod(lctx.lua_scripts_lru_list, sdsfreeVoid); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -777,7 +777,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs, (void (*)(void *))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreeVoid); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/functions.c b/src/functions.c index b694e35252..feb82d4ab7 100644 --- a/src/functions.c +++ b/src/functions.c @@ -348,7 +348,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l } else { if (!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void *))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryDispose); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); diff --git a/src/listpack.c b/src/listpack.c index 2dfb321f56..76c2f9ea38 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -250,6 +250,12 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Same as lpFree, but useful for when you are passing the listpack + * into a generic free function that expects (void *) */ +void lpFreeVoid(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. */ unsigned char *lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); diff --git a/src/listpack.h b/src/listpack.h index aa7636143f..b143797261 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -56,6 +56,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeVoid(void *lp); unsigned char *lpShrinkToFit(unsigned char *lp); unsigned char * lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/module.c b/src/module.c index 36283e2c73..541ae490ab 100644 --- a/src/module.c +++ b/src/module.c @@ -10399,7 +10399,7 @@ ValkeyModuleServerInfoData *VM_GetServerInfo(ValkeyModuleCtx *ctx, const char *s * context instead of passing NULL. 
*/ void VM_FreeServerInfo(ValkeyModuleCtx *ctx, ValkeyModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx, VALKEYMODULE_AM_INFO, data); - raxFreeWithCallback(data->rax, (void (*)(void *))sdsfree); + raxFreeWithCallback(data->rax, sdsfreeVoid); zfree(data); } diff --git a/src/networking.c b/src/networking.c index 4d386d6dc4..16147ff0ba 100644 --- a/src/networking.c +++ b/src/networking.c @@ -556,7 +556,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flag.module) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void *))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; diff --git a/src/replication.c b/src/replication.c index b5ce77f5e0..3a207a1d0f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -282,7 +282,7 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { void resetReplicationBuffer(void) { server.repl_buffer_mem = 0; server.repl_buffer_blocks = listCreate(); - listSetFreeMethod(server.repl_buffer_blocks, (void (*)(void *))zfree); + listSetFreeMethod(server.repl_buffer_blocks, zfree); } int canFeedReplicaReplBuffer(client *replica) { diff --git a/src/t_stream.c b/src/t_stream.c index 79aa080703..17254b58dd 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -54,6 +54,7 @@ #define STREAM_LISTPACK_MAX_SIZE (1 << 30) void streamFreeCG(streamCG *cg); +void streamFreeCGVoid(void *cg); void streamFreeNACK(streamNACK *na); size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, @@ -86,8 +87,8 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. */ void freeStream(stream *s) { - raxFreeWithCallback(s->rax, (void (*)(void *))lpFree); - if (s->cgroups) raxFreeWithCallback(s->cgroups, (void (*)(void *))streamFreeCG); + raxFreeWithCallback(s->rax, lpFreeVoid); + if (s->cgroups) raxFreeWithCallback(s->cgroups, streamFreeCGVoid); zfree(s); } @@ -2454,6 +2455,11 @@ void streamFreeConsumer(streamConsumer *sc) { zfree(sc); } +/* Used for generic free functions. */ +static void streamFreeConsumerVoid(void *sc) { + streamFreeConsumer((streamConsumer *)sc); +} + /* Create a new consumer group in the context of the stream 's', having the * specified name, last server ID and reads counter. If a consumer group with * the same name already exists NULL is returned, otherwise the pointer to the @@ -2473,11 +2479,16 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo /* Free a consumer group and all its associated data. */ void streamFreeCG(streamCG *cg) { - raxFreeWithCallback(cg->pel, (void (*)(void *))streamFreeNACK); - raxFreeWithCallback(cg->consumers, (void (*)(void *))streamFreeConsumer); + raxFreeWithCallback(cg->pel, zfree); + raxFreeWithCallback(cg->consumers, streamFreeConsumerVoid); zfree(cg); } +/* Used for generic free functions. */ +void streamFreeCGVoid(void *cg) { + streamFreeCG((streamCG *)cg); +} + /* Lookup the consumer group in the specified stream and returns its * pointer, otherwise if there is no such group, NULL is returned. 
*/ streamCG *streamLookupCG(stream *s, sds groupname) { diff --git a/src/unit/test_listpack.c b/src/unit/test_listpack.c index 4838fc8952..0c71da18db 100644 --- a/src/unit/test_listpack.c +++ b/src/unit/test_listpack.c @@ -1184,7 +1184,7 @@ int test_listpackStressWithRandom(int argc, char **argv, int flags) { for (i = 0; i < iteration; i++) { lp = lpNew(0); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ diff --git a/src/unit/test_ziplist.c b/src/unit/test_ziplist.c index d2f7ebe69c..58687d81fc 100644 --- a/src/unit/test_ziplist.c +++ b/src/unit/test_ziplist.c @@ -645,7 +645,7 @@ int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **arg for (i = 0; i < iteration; i++) { zl = ziplistNew(); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ From c72089932c75c9ad6722f17e62bc6299ab142721 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:03:30 +0200 Subject: [PATCH 022/101] Offload TLS negotiation to I/O threads (#1338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## TLS Negotiation Offloading to I/O Threads ### Overview This PR introduces the ability to offload TLS handshake negotiations to I/O threads, significantly improving performance under high TLS connection loads. ### Key Changes - Added infrastructure to offload TLS negotiations to I/O threads - Refactored SSL event handling to allow I/O threads modify conn flags. - Introduced new connection flag to identify client connections ### Performance Impact Testing with 650 clients with SET commands and 160 new TLS connections per second in the background: #### Throughput Impact of new TLS connections - **With Offloading**: Minimal impact (1050K → 990K ops/sec) - **Without Offloading**: Significant drop (1050K → 670K ops/sec) #### New Connection Rate - **With Offloading**: - 1,757 conn/sec - **Without Offloading**: - 477 conn/sec ### Implementation Details 1. **Main Thread**: - Initiates negotiation-offload jobs to I/O threads - Adds connections to pending-read clients list (using existing read offload mechanism) - Post-negotiation handling: - Creates read/write events if needed for incomplete negotiations - Calls accept handler for completed negotiations 2. 
**I/O Thread**: - Performs TLS negotiation - Updates connection flags based on negotiation result Related issue:https://github.com/valkey-io/valkey/issues/761 --------- Signed-off-by: Uri Yagelnik Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: Madelyn Olson --- .github/workflows/daily.yml | 38 ++++++++++ src/connection.h | 5 +- src/io_threads.c | 52 ++++++++++++++ src/io_threads.h | 1 + src/networking.c | 6 ++ src/server.c | 2 + src/server.h | 1 + src/tls.c | 139 ++++++++++++++++++------------------ 8 files changed, 174 insertions(+), 70 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 44386f5ffd..e1d577b51b 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -375,6 +375,44 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}} + test-ubuntu-tls-io-threads: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && + !contains(github.event.inputs.skipjobs, 'tls') && !contains(github.event.inputs.skipjobs, 'iothreads') + timeout-minutes: 14400 + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: | + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get install tcl8.6 tclx tcl-tls + ./utils/gen-test-certs.sh + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: | + ./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: | + ./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}} + test-ubuntu-reclaim-cache: runs-on: ubuntu-latest if: | diff --git a/src/connection.h b/src/connection.h index 8a2775ee34..fd7e0910cf 100644 --- a/src/connection.h +++ b/src/connection.h @@ -54,8 +54,9 @@ typedef enum { CONN_STATE_ERROR } ConnectionState; -#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ -#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ +#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_ALLOW_ACCEPT_OFFLOAD (1 << 2) /* Connection accept can be offloaded to IO threads. 
*/ #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" diff --git a/src/io_threads.c b/src/io_threads.c index 3865eb77c3..90f5b88700 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -561,3 +561,55 @@ void trySendPollJobToIOThreads(void) { aeSetPollProtect(server.el, 1); IOJobQueue_push(jq, IOThreadPoll, server.el); } + +static void ioThreadAccept(void *data) { + client *c = (client *)data; + connAccept(c->conn, NULL); + c->io_read_state = CLIENT_COMPLETED_IO; +} + +/* + * Attempts to offload an Accept operation (currently used for TLS accept) for a client + * connection to I/O threads. + * + * Returns: + * C_OK - If the accept operation was successfully queued for processing + * C_ERR - If the connection is not eligible for offloading + * + * Parameters: + * conn - The connection object to perform the accept operation on + */ +int trySendAcceptToIOThreads(connection *conn) { + if (server.io_threads_num <= 1) { + return C_ERR; + } + + if (!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD)) { + return C_ERR; + } + + client *c = connGetPrivateData(conn); + if (c->io_read_state != CLIENT_IDLE) { + return C_OK; + } + + if (server.active_io_threads_num <= 1) { + return C_ERR; + } + + size_t thread_id = (c->id % (server.active_io_threads_num - 1)) + 1; + IOJobQueue *job_queue = &io_jobs[thread_id]; + + if (IOJobQueue_isFull(job_queue)) { + return C_ERR; + } + + c->io_read_state = CLIENT_PENDING_IO; + c->flag.pending_read = 1; + listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); + connSetPostponeUpdateState(c->conn, 1); + server.stat_io_accept_offloaded++; + IOJobQueue_push(job_queue, ioThreadAccept, c); + + return C_OK; +} diff --git a/src/io_threads.h b/src/io_threads.h index 8818f08588..a3ff582a77 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -13,5 +13,6 @@ int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); +int trySendAcceptToIOThreads(connection *conn); #endif /* IO_THREADS_H */ diff --git a/src/networking.c b/src/networking.c index 16147ff0ba..9f36f24275 100644 --- a/src/networking.c +++ b/src/networking.c @@ -134,6 +134,7 @@ client *createClient(connection *conn) { if (server.tcpkeepalive) connKeepAlive(conn, server.tcpkeepalive); connSetReadHandler(conn, readQueryFromClient); connSetPrivateData(conn, c); + conn->flags |= CONN_FLAG_ALLOW_ACCEPT_OFFLOAD; } c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size); selectDb(c, 0); @@ -4805,9 +4806,14 @@ int processIOThreadsReadDone(void) { processed++; server.stat_io_reads_processed++; + /* Save the current conn state, as connUpdateState may modify it */ + int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING); connSetPostponeUpdateState(c->conn, 0); connUpdateState(c->conn); + /* In accept state, no client's data was read - stop here. */ + if (in_accept_state) continue; + /* On read error - stop here. 
*/ if (handleReadResult(c) == C_ERR) { continue; diff --git a/src/server.c b/src/server.c index db39970632..a0c642b541 100644 --- a/src/server.c +++ b/src/server.c @@ -2645,6 +2645,7 @@ void resetServerStats(void) { server.stat_total_reads_processed = 0; server.stat_io_writes_processed = 0; server.stat_io_freed_objects = 0; + server.stat_io_accept_offloaded = 0; server.stat_poll_processed_by_io_threads = 0; server.stat_total_writes_processed = 0; server.stat_client_qbuf_limit_disconnections = 0; @@ -5922,6 +5923,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed, "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed, "io_threaded_freed_objects:%lld\r\n", server.stat_io_freed_objects, + "io_threaded_accept_processed:%lld\r\n", server.stat_io_accept_offloaded, "io_threaded_poll_processed:%lld\r\n", server.stat_poll_processed_by_io_threads, "io_threaded_total_prefetch_batches:%lld\r\n", server.stat_total_prefetch_batches, "io_threaded_total_prefetch_entries:%lld\r\n", server.stat_total_prefetch_entries, diff --git a/src/server.h b/src/server.h index b0e4ae1050..d8497ccff5 100644 --- a/src/server.h +++ b/src/server.h @@ -1869,6 +1869,7 @@ struct valkeyServer { long long stat_io_reads_processed; /* Number of read events processed by IO threads */ long long stat_io_writes_processed; /* Number of write events processed by IO threads */ long long stat_io_freed_objects; /* Number of objects freed by IO threads */ + long long stat_io_accept_offloaded; /* Number of offloaded accepts */ long long stat_poll_processed_by_io_threads; /* Total number of poll jobs processed by IO */ long long stat_total_reads_processed; /* Total number of read events processed */ long long stat_total_writes_processed; /* Total number of write events processed */ diff --git a/src/tls.c b/src/tls.c index 48b75553de..11e6143561 100644 --- a/src/tls.c +++ b/src/tls.c @@ -32,6 +32,7 @@ #include "server.h" #include "connhelpers.h" #include "adlist.h" +#include "io_threads.h" #if (USE_OPENSSL == 1 /* BUILD_YES */) || ((USE_OPENSSL == 2 /* BUILD_MODULE */) && (BUILD_TLS_MODULE == 2)) @@ -437,16 +438,13 @@ static ConnectionType CT_TLS; * */ -typedef enum { - WANT_READ = 1, - WANT_WRITE -} WantIOType; - #define TLS_CONN_FLAG_READ_WANT_WRITE (1 << 0) #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) #define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) #define TLS_CONN_FLAG_HAS_PENDING (1 << 4) +#define TLS_CONN_FLAG_ACCEPT_ERROR (1 << 5) +#define TLS_CONN_FLAG_ACCEPT_SUCCESS (1 << 6) typedef struct tls_connection { connection c; @@ -514,20 +512,26 @@ static connection *connCreateAcceptedTLS(int fd, void *priv) { return (connection *)conn; } +static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler); static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask); static void updateSSLEvent(tls_connection *conn); +static void clearTLSWantFlags(tls_connection *conn) { + conn->flags &= ~(TLS_CONN_FLAG_WRITE_WANT_READ | TLS_CONN_FLAG_READ_WANT_WRITE); +} + /* Process the return code received from OpenSSL> - * Update the want parameter with expected I/O. + * Update the conn flags with the WANT_READ/WANT_WRITE flags. * Update the connection's error state if a real error has occurred. * Returns an SSL error code, or 0 if no further handling is required. 
*/ -static int handleSSLReturnCode(tls_connection *conn, int ret_value, WantIOType *want) { +static int handleSSLReturnCode(tls_connection *conn, int ret_value) { + clearTLSWantFlags(conn); if (ret_value <= 0) { int ssl_err = SSL_get_error(conn->ssl, ret_value); switch (ssl_err) { - case SSL_ERROR_WANT_WRITE: *want = WANT_WRITE; return 0; - case SSL_ERROR_WANT_READ: *want = WANT_READ; return 0; + case SSL_ERROR_WANT_WRITE: conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; return 0; + case SSL_ERROR_WANT_READ: conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; return 0; case SSL_ERROR_SYSCALL: conn->c.last_errno = errno; if (conn->ssl_error) zfree(conn->ssl_error); @@ -563,11 +567,8 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update } if (ret_value <= 0) { - WantIOType want = 0; int ssl_err; - if (!(ssl_err = handleSSLReturnCode(conn, ret_value, &want))) { - if (want == WANT_READ) conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; - if (want == WANT_WRITE) conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; + if (!(ssl_err = handleSSLReturnCode(conn, ret_value))) { if (update_event) updateSSLEvent(conn); errno = EAGAIN; return -1; @@ -585,19 +586,17 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update return ret_value; } -static void registerSSLEvent(tls_connection *conn, WantIOType want) { +static void registerSSLEvent(tls_connection *conn) { int mask = aeGetFileEvents(server.el, conn->c.fd); - switch (want) { - case WANT_READ: + if (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ) { if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); - break; - case WANT_WRITE: + } else if (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE) { if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); - break; - default: serverAssert(0); break; + } else { + serverAssert(0); } } @@ -650,12 +649,47 @@ static void updateSSLEvent(tls_connection *conn) { if (!need_write && (mask & AE_WRITABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); } +static int TLSHandleAcceptResult(tls_connection *conn, int call_handler_on_error) { + serverAssert(conn->c.state == CONN_STATE_ACCEPTING); + if (conn->flags & TLS_CONN_FLAG_ACCEPT_SUCCESS) { + conn->c.state = CONN_STATE_CONNECTED; + } else if (conn->flags & TLS_CONN_FLAG_ACCEPT_ERROR) { + conn->c.state = CONN_STATE_ERROR; + if (!call_handler_on_error) return C_ERR; + } else { + /* Still pending accept */ + registerSSLEvent(conn); + return C_OK; + } + + /* call accept handler */ + if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_ERR; + conn->c.conn_handler = NULL; + return C_OK; +} + static void updateSSLState(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; + + if (conn->c.state == CONN_STATE_ACCEPTING) { + if (TLSHandleAcceptResult(conn, 1) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; + } + updateSSLEvent(conn); updatePendingData(conn); } +static void TLSAccept(void *_conn) { + tls_connection *conn = (tls_connection *)_conn; + ERR_clear_error(); + int ret = SSL_accept(conn->ssl); + if (ret > 0) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_SUCCESS; + } else if (handleSSLReturnCode(conn, ret)) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_ERROR; + } +} + static void tlsHandleEvent(tls_connection *conn, int mask) { int ret, conn_error; @@ 
-676,10 +710,8 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { } ret = SSL_connect(conn->ssl); if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); - + if (!handleSSLReturnCode(conn, ret)) { + registerSSLEvent(conn); /* Avoid hitting UpdateSSLEvent, which knows nothing * of what SSL_connect() wants and instead looks at our * R/W handlers. @@ -698,27 +730,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.conn_handler = NULL; break; case CONN_STATE_ACCEPTING: - ERR_clear_error(); - ret = SSL_accept(conn->ssl); - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - /* Avoid hitting UpdateSSLEvent, which knows nothing - * of what SSL_connect() wants and instead looks at our - * R/W handlers. - */ - registerSSLEvent(conn, want); - return; - } - - /* If not handled, it's an error */ - conn->c.state = CONN_STATE_ERROR; - } else { - conn->c.state = CONN_STATE_CONNECTED; - } - - if (!callHandler((connection *)conn, conn->c.conn_handler)) return; - conn->c.conn_handler = NULL; + if (connTLSAccept((connection *)conn, NULL) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; break; case CONN_STATE_CONNECTED: { int call_read = ((mask & AE_READABLE) && conn->c.read_handler) || @@ -740,20 +752,17 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { int invert = conn->c.flags & CONN_FLAG_WRITE_BARRIER; if (!invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } /* Fire the writable event. */ if (call_write) { - conn->flags &= ~TLS_CONN_FLAG_WRITE_WANT_READ; if (!callHandler((connection *)conn, conn->c.write_handler)) return; } /* If we have to invert the call, fire the readable event now * after the writable one. 
*/ if (invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } updatePendingData(conn); @@ -845,31 +854,25 @@ static void connTLSClose(connection *conn_) { static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler) { tls_connection *conn = (tls_connection *)_conn; - int ret; - if (conn->c.state != CONN_STATE_ACCEPTING) return C_ERR; - ERR_clear_error(); - + int call_handler_on_error = 1; /* Try to accept */ - conn->c.conn_handler = accept_handler; - ret = SSL_accept(conn->ssl); - - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); /* We'll fire back */ - return C_OK; - } else { - conn->c.state = CONN_STATE_ERROR; - return C_ERR; - } + if (accept_handler) { + conn->c.conn_handler = accept_handler; + call_handler_on_error = 0; } - conn->c.state = CONN_STATE_CONNECTED; - if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_OK; - conn->c.conn_handler = NULL; + /* We're in IO thread - just call accept and return, the main thread will handle the rest */ + if (!inMainThread()) { + TLSAccept(conn); + return C_OK; + } - return C_OK; + /* Try to offload accept to IO threads */ + if (trySendAcceptToIOThreads(_conn) == C_OK) return C_OK; + + TLSAccept(conn); + return TLSHandleAcceptResult(conn, call_handler_on_error); } static int connTLSConnect(connection *conn_, From bfad1106a11086175d3bab6703f217e25d41548d Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 18 Dec 2024 09:17:11 -0800 Subject: [PATCH 023/101] Attempt to read secondary error from info test (#1452) The test attempts to write 1MB of data in order to trigger a disconnect. Normally, the data is fully flushed and we get the error on the read (I/O error). However, it's possible we might fail the write, which leaves the client in an inconsistent state. On the next command, we finally process the I/O error on the FD. So, the simple fix is to consume any secondary errors. --------- Signed-off-by: Madelyn Olson --- tests/unit/info.tcl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index a27043fa88..11dc4e5d40 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -391,7 +391,13 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { # set qbuf limit to minimum to test stat set org_qbuf_limit [lindex [r config get client-query-buffer-limit] 1] r config set client-query-buffer-limit 1048576 - catch {r set key [string repeat a 1048576]} + catch {r set key [string repeat a 2048576]} e + # We might get an error on the write path of the previous command, which won't be + # an I/O error based on how the client is designed. We will need to manually consume + # the secondary I/O error. + if {![string match "I/O error*" $e]} { + catch {r read} + } set info [r info stats] assert_equal [getInfoProperty $info client_query_buffer_limit_disconnections] {1} r config set client-query-buffer-limit $org_qbuf_limit From 84c1a44ce96328ddb36e167e0899f4626828b0fc Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 18 Dec 2024 22:18:02 -0800 Subject: [PATCH 024/101] Add a hint about the current file for TCL debugging (#1459) There are some tests that fail and give no useful information since they are outside of a test context. Now we will at least get the file we are located in. We can sort of reverse engineer where we are in the test by seeing which tests have finished in a file. 
```
[TIMEOUT]: clients state report follows.
sock6 => (SPAWNED SERVER) pid:30375 - tests/unit/info.tcl
Killing still running Valkey server 30375 - tests/unit/info.tcl
```

Signed-off-by: Madelyn Olson
---
 tests/support/server.tcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 8c545d900a..bd3135e9d9 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -314,7 +314,7 @@ proc spawn_server {config_file stdout stderr args} {
     }

     # Tell the test server about this new instance.
-    send_data_packet $::test_server_fd server-spawned $pid
+    send_data_packet $::test_server_fd server-spawned "$pid - $::curfile"
     return $pid
 }

From fff1573236a9c0965fe235b68e3c547e5a36b219 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 19 Dec 2024 16:12:34 +0800
Subject: [PATCH 025/101] Minor log fixes when failover auth denied due to slot epoch (#1341)

The old reqEpoch mainly refers to requestCurrentEpoch, see:
```
if (requestCurrentEpoch < server.cluster->currentEpoch) {
    serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", node->name,
              node->human_nodename, (unsigned long long)requestCurrentEpoch,
              (unsigned long long)server.cluster->currentEpoch);
    return;
}
```
The log changed here, however, refers to requestConfigEpoch, so calling it
reqEpoch is misleading; change it to reqConfigEpoch to make this clear.

Signed-off-by: Binbin
---
 src/cluster_legacy.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9ddcf6678d..bbf63d46b9 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4430,7 +4430,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
          * by the replica requesting our vote. Refuse to vote for this replica. */
         serverLog(LL_WARNING,
                   "Failover auth denied to %.40s (%s): "
-                  "slot %d epoch (%llu) > reqEpoch (%llu)",
+                  "slot %d epoch (%llu) > reqConfigEpoch (%llu)",
                   node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch,
                   (unsigned long long)requestConfigEpoch);
         return;
@@ -4721,8 +4721,8 @@ void clusterHandleReplicaFailover(void) {
     if (server.cluster->failover_auth_sent == 0) {
         server.cluster->currentEpoch++;
         server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
-        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu.",
-                  (unsigned long long)server.cluster->currentEpoch);
+        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu",
+                  (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
         clusterRequestFailoverAuth();
         server.cluster->failover_auth_sent = 1;
         clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);

From 4aa656fb393fa243aade9bb7034dd7cba5b7e3fe Mon Sep 17 00:00:00 2001
From: Jungwoo Song <37579681+bluayer@users.noreply.github.com>
Date: Fri, 20 Dec 2024 01:32:31 +0900
Subject: [PATCH 026/101] Support for reading from replicas in valkey-benchmark (#1392)

**Background**
When conducting performance tests using `valkey-benchmark`, reading from
replicas was not supported. Consequently, even in cluster mode, all reads
were directed to the primary nodes. This limitation made it challenging to
obtain accurate metrics during workload stress testing for performance
measurement or before a version upgrade.

Related issue: https://github.com/valkey-io/valkey/issues/900

**Changes**
1.
Replaced the use of `CLUSTER NODES` with `CLUSTER SLOTS` when fetching cluster configuration. This allows for easier identification of replica slots. 2. Support for reading from replicas by executing the client in `READONLY` mode. 3. Support reading from replicas even during slot migrations. 4. Introduced two CLI options `--rfr` to enable reading from replicas only or all cluster nodes. A warning added to indicate that write requests might not be handled correctly when using this option. --------- Signed-off-by: bluayer Signed-off-by: bluayer Signed-off-by: Jungwoo Song <37579681+bluayer@users.noreply.github.com> Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> --- src/valkey-benchmark.c | 354 +++++++++++++++++++---------------------- 1 file changed, 168 insertions(+), 186 deletions(-) diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c index 57cdd6fc16..1924203ae7 100644 --- a/src/valkey-benchmark.c +++ b/src/valkey-benchmark.c @@ -77,6 +77,13 @@ struct benchmarkThread; struct clusterNode; struct serverConfig; +/* Read from replica options */ +typedef enum readFromReplica { + FROM_PRIMARY_ONLY = 0, /* default option */ + FROM_REPLICA_ONLY, + FROM_ALL +} readFromReplica; + static struct config { aeEventLoop *el; cliConnInfo conn_info; @@ -112,6 +119,7 @@ static struct config { int num_threads; struct benchmarkThread **threads; int cluster_mode; + readFromReplica read_from_replica; int cluster_node_count; struct clusterNode **cluster_nodes; struct serverConfig *redis_config; @@ -168,12 +176,6 @@ typedef struct clusterNode { int *updated_slots; /* Used by updateClusterSlotsConfiguration */ int updated_slots_count; /* Used by updateClusterSlotsConfiguration */ int replicas_count; - sds *migrating; /* An array of sds where even strings are slots and odd - * strings are the destination node IDs. */ - sds *importing; /* An array of sds where even strings are slots and odd - * strings are the source node IDs. */ - int migrating_count; /* Length of the migrating array (migrating slots*2) */ - int importing_count; /* Length of the importing array (importing slots*2) */ struct serverConfig *redis_config; } clusterNode; @@ -228,6 +230,15 @@ static int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +static dictType dtype = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + dictSdsKeyCompare, /* key compare */ + NULL, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + static redisContext *getRedisContext(const char *ip, int port, const char *hostsocket) { redisContext *ctx = NULL; redisReply *reply = NULL; @@ -710,6 +721,15 @@ static client createClient(char *cmd, size_t len, client from, int thread_id) { c->prefix_pending++; } + if (config.cluster_mode && (config.read_from_replica == FROM_REPLICA_ONLY || config.read_from_replica == FROM_ALL)) { + char *buf = NULL; + int len; + len = redisFormatCommand(&buf, "READONLY"); + c->obuf = sdscatlen(c->obuf, buf, len); + free(buf); + c->prefix_pending++; + } + c->prefixlen = sdslen(c->obuf); /* Append the request itself. 
*/ if (from) { @@ -835,7 +855,15 @@ static void showLatencyReport(void) { printf(" %d bytes payload\n", config.datasize); printf(" keep alive: %d\n", config.keepalive); if (config.cluster_mode) { - printf(" cluster mode: yes (%d primaries)\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf(" cluster mode: yes (%d %s)\n", config.cluster_node_count, node_roles); int m; for (m = 0; m < config.cluster_node_count; m++) { clusterNode *node = config.cluster_nodes[m]; @@ -1009,26 +1037,13 @@ static clusterNode *createClusterNode(char *ip, int port) { node->slots_count = 0; node->updated_slots = NULL; node->updated_slots_count = 0; - node->migrating = NULL; - node->importing = NULL; - node->migrating_count = 0; - node->importing_count = 0; node->redis_config = NULL; return node; } static void freeClusterNode(clusterNode *node) { - int i; if (node->name) sdsfree(node->name); if (node->replicate) sdsfree(node->replicate); - if (node->migrating != NULL) { - for (i = 0; i < node->migrating_count; i++) sdsfree(node->migrating[i]); - zfree(node->migrating); - } - if (node->importing != NULL) { - for (i = 0; i < node->importing_count; i++) sdsfree(node->importing[i]); - zfree(node->importing); - } /* If the node is not the reference node, that uses the address from * config.conn_info.hostip and config.conn_info.hostport, then the node ip has been * allocated by fetchClusterConfiguration, so it must be freed. */ @@ -1056,157 +1071,85 @@ static clusterNode **addClusterNode(clusterNode *node) { return config.cluster_nodes; } -/* TODO: This should be refactored to use CLUSTER SLOTS, the migrating/importing - * information is anyway not used. 
- */ static int fetchClusterConfiguration(void) { int success = 1; redisContext *ctx = NULL; redisReply *reply = NULL; + dict *nodes = NULL; + const char *errmsg = "Failed to fetch cluster configuration"; + size_t i, j; ctx = getRedisContext(config.conn_info.hostip, config.conn_info.hostport, config.hostsocket); if (ctx == NULL) { exit(1); } - clusterNode *firstNode = createClusterNode((char *)config.conn_info.hostip, config.conn_info.hostport); - if (!firstNode) { + + reply = redisCommand(ctx, "CLUSTER SLOTS"); + if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { success = 0; + if (reply) fprintf(stderr, "%s\nCLUSTER SLOTS ERROR: %s\n", errmsg, reply->str); goto cleanup; } - reply = redisCommand(ctx, "CLUSTER NODES"); - success = (reply != NULL); - if (!success) goto cleanup; - success = (reply->type != REDIS_REPLY_ERROR); - if (!success) { - if (config.hostsocket == NULL) { - fprintf(stderr, "Cluster node %s:%d replied with error:\n%s\n", config.conn_info.hostip, - config.conn_info.hostport, reply->str); - } else { - fprintf(stderr, "Cluster node %s replied with error:\n%s\n", config.hostsocket, reply->str); - } - goto cleanup; - } - char *lines = reply->str, *p, *line; - while ((p = strstr(lines, "\n")) != NULL) { - *p = '\0'; - line = lines; - lines = p + 1; - char *name = NULL, *addr = NULL, *flags = NULL, *primary_id = NULL; - int i = 0; - while ((p = strchr(line, ' ')) != NULL) { - *p = '\0'; - char *token = line; - line = p + 1; - switch (i++) { - case 0: name = token; break; - case 1: addr = token; break; - case 2: flags = token; break; - case 3: primary_id = token; break; - } - if (i == 8) break; // Slots - } - if (!flags) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing flags.\n"); - success = 0; - goto cleanup; - } - int myself = (strstr(flags, "myself") != NULL); - int is_replica = (strstr(flags, "slave") != NULL || (primary_id != NULL && primary_id[0] != '-')); - if (is_replica) continue; - if (addr == NULL) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing addr.\n"); - success = 0; - goto cleanup; - } - clusterNode *node = NULL; - char *ip = NULL; - int port = 0; - char *paddr = strrchr(addr, ':'); - if (paddr != NULL) { - *paddr = '\0'; - ip = addr; - addr = paddr + 1; - /* If internal bus is specified, then just drop it. 
*/ - if ((paddr = strchr(addr, '@')) != NULL) *paddr = '\0'; - port = atoi(addr); - } - if (myself) { - node = firstNode; - if (ip != NULL && strcmp(node->ip, ip) != 0) { - node->ip = sdsnew(ip); - node->port = port; + assert(reply->type == REDIS_REPLY_ARRAY); + nodes = dictCreate(&dtype); + for (i = 0; i < reply->elements; i++) { + redisReply *r = reply->element[i]; + assert(r->type == REDIS_REPLY_ARRAY); + assert(r->elements >= 3); + int from = r->element[0]->integer; + int to = r->element[1]->integer; + sds primary = NULL; + for (j = 2; j < r->elements; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[0]->str != NULL); + assert(nr->element[2]->str != NULL); + + int is_primary = (j == 2); + if (is_primary) primary = sdsnew(nr->element[2]->str); + int is_cluster_option_only = (config.read_from_replica == FROM_PRIMARY_ONLY); + if ((config.read_from_replica == FROM_REPLICA_ONLY && is_primary) || (is_cluster_option_only && !is_primary)) continue; + + sds ip = sdsnew(nr->element[0]->str); + sds name = sdsnew(nr->element[2]->str); + int port = nr->element[1]->integer; + int slot_start = from; + int slot_end = to; + + clusterNode *node = NULL; + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + node = createClusterNode(sdsnew(ip), port); + if (node == NULL) { + success = 0; + goto cleanup; + } else { + node->name = name; + if (!is_primary) node->replicate = sdsdup(primary); + } + } else { + node = dictGetVal(entry); } - } else { - node = createClusterNode(sdsnew(ip), port); - } - if (node == NULL) { - success = 0; - goto cleanup; - } - if (name != NULL) node->name = sdsnew(name); - if (i == 8) { - int remaining = strlen(line); - while (remaining > 0) { - p = strchr(line, ' '); - if (p == NULL) p = line + remaining; - remaining -= (p - line); - - char *slotsdef = line; - *p = '\0'; - if (remaining) { - line = p + 1; - remaining--; - } else - line = p; - char *dash = NULL; - if (slotsdef[0] == '[') { - slotsdef++; - if ((p = strstr(slotsdef, "->-"))) { // Migrating - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds dst = sdsnew(p); - node->migrating_count += 2; - node->migrating = zrealloc(node->migrating, (node->migrating_count * sizeof(sds))); - node->migrating[node->migrating_count - 2] = slot; - node->migrating[node->migrating_count - 1] = dst; - } else if ((p = strstr(slotsdef, "-<-"))) { // Importing - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds src = sdsnew(p); - node->importing_count += 2; - node->importing = zrealloc(node->importing, (node->importing_count * sizeof(sds))); - node->importing[node->importing_count - 2] = slot; - node->importing[node->importing_count - 1] = src; - } - } else if ((dash = strchr(slotsdef, '-')) != NULL) { - p = dash; - int start, stop; - *p = '\0'; - start = atoi(slotsdef); - stop = atoi(p + 1); - while (start <= stop) { - int slot = start++; - node->slots[node->slots_count++] = slot; - } - } else if (p > slotsdef) { - int slot = atoi(slotsdef); + if (slot_start == slot_end) { + node->slots[node->slots_count++] = slot_start; + } else { + while (slot_start <= slot_end) { + int slot = slot_start++; node->slots[node->slots_count++] = slot; } } + if (node->slots_count == 0) { + fprintf(stderr, "WARNING: Node %s:%d has no slots, skipping...\n", node->ip, node->port); + 
continue; + } + if (entry == NULL) { + dictReplace(nodes, node->name, node); + if (!addClusterNode(node)) { + success = 0; + goto cleanup; + } + } } - if (node->slots_count == 0) { - fprintf(stderr, "WARNING: Primary node %s:%d has no slots, skipping...\n", node->ip, node->port); - continue; - } - if (!addClusterNode(node)) { - success = 0; - goto cleanup; - } + sdsfree(primary); } cleanup: if (ctx) redisFree(ctx); @@ -1214,6 +1157,7 @@ static int fetchClusterConfiguration(void) { if (config.cluster_nodes) freeClusterNodes(); } if (reply) freeReplyObject(reply); + if (nodes) dictRelease(nodes); return success; } @@ -1222,7 +1166,7 @@ static int fetchClusterConfiguration(void) { static int fetchClusterSlotsConfiguration(client c) { UNUSED(c); int success = 1, is_fetching_slots = 0, last_update = 0; - size_t i; + size_t i, j; last_update = atomic_load_explicit(&config.slots_last_update, memory_order_relaxed); if (c->slots_last_update < last_update) { @@ -1236,16 +1180,9 @@ static int fetchClusterSlotsConfiguration(client c) { atomic_store_explicit(&config.is_fetching_slots, 1, memory_order_relaxed); fprintf(stderr, "WARNING: Cluster slots configuration changed, fetching new one...\n"); const char *errmsg = "Failed to update cluster slots configuration"; - static dictType dtype = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ - }; + /* printf("[%d] fetchClusterSlotsConfiguration\n", c->thread_id); */ - dict *primaries = dictCreate(&dtype); + dict *nodes = dictCreate(&dtype); redisContext *ctx = NULL; for (i = 0; i < (size_t)config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1263,7 +1200,7 @@ static int fetchClusterSlotsConfiguration(client c) { if (node->updated_slots != NULL) zfree(node->updated_slots); node->updated_slots = NULL; node->updated_slots_count = 0; - dictReplace(primaries, node->name, node); + dictReplace(nodes, node->name, node); } reply = redisCommand(ctx, "CLUSTER SLOTS"); if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { @@ -1279,30 +1216,44 @@ static int fetchClusterSlotsConfiguration(client c) { int from, to, slot; from = r->element[0]->integer; to = r->element[1]->integer; - redisReply *nr = r->element[2]; - assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); - assert(nr->element[2]->str != NULL); - sds name = sdsnew(nr->element[2]->str); - dictEntry *entry = dictFind(primaries, name); - if (entry == NULL) { - success = 0; - fprintf(stderr, - "%s: could not find node with ID %s in current " - "configuration.\n", - errmsg, name); - if (name) sdsfree(name); - goto cleanup; + size_t start, end; + if (config.read_from_replica == FROM_ALL) { + start = 2; + end = r->elements; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + start = 3; + end = r->elements; + } else { + start = 2; + end = 3; + } + + for (j = start; j < end; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[2]->str != NULL); + sds name = sdsnew(nr->element[2]->str); + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + success = 0; + fprintf(stderr, + "%s: could not find node with ID %s in current " + "configuration.\n", + errmsg, name); + if (name) sdsfree(name); + goto cleanup; + } + sdsfree(name); + clusterNode *node = dictGetVal(entry); + if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * 
sizeof(int)); + for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } - sdsfree(name); - clusterNode *node = dictGetVal(entry); - if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * sizeof(int)); - for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } updateClusterSlotsConfiguration(); cleanup: freeReplyObject(reply); redisFree(ctx); - dictRelease(primaries); + dictRelease(nodes); atomic_store_explicit(&config.is_fetching_slots, 0, memory_order_relaxed); return success; } @@ -1460,6 +1411,19 @@ int parseOptions(int argc, char **argv) { config.num_threads = 0; } else if (!strcmp(argv[i], "--cluster")) { config.cluster_mode = 1; + } else if (!strcmp(argv[i], "--rfr")) { + if (argv[++i]) { + if (!strcmp(argv[i], "all")) { + config.read_from_replica = FROM_ALL; + } else if (!strcmp(argv[i], "yes")) { + config.read_from_replica = FROM_REPLICA_ONLY; + } else if (!strcmp(argv[i], "no")) { + config.read_from_replica = FROM_PRIMARY_ONLY; + } else { + goto invalid; + } + } else + goto invalid; } else if (!strcmp(argv[i], "--enable-tracking")) { config.enable_tracking = 1; } else if (!strcmp(argv[i], "--help")) { @@ -1557,6 +1521,14 @@ int parseOptions(int argc, char **argv) { " If the command is supplied on the command line in cluster\n" " mode, the key must contain \"{tag}\". Otherwise, the\n" " command will not be sent to the right cluster node.\n" + " --rfr Enable read from replicas in cluster mode.\n" + " This command must be used with the --cluster option.\n" + " There are three modes for reading from replicas:\n" + " 'no' - sends read requests to primaries only (default) \n" + " 'yes' - sends read requests to replicas only.\n" + " 'all' - sends read requests to all nodes.\n" + " Since write commands will not be accepted by replicas,\n" + " it is recommended to enable read from replicas only for read command tests.\n" " --enable-tracking Send CLIENT TRACKING on before starting benchmark.\n" " -k 1=keep alive 0=reconnect (default 1)\n" " -r Use random keys for SET/GET/INCR, random values for SADD,\n" @@ -1698,6 +1670,7 @@ int main(int argc, char **argv) { config.num_threads = 0; config.threads = NULL; config.cluster_mode = 0; + config.read_from_replica = FROM_PRIMARY_ONLY; config.cluster_node_count = 0; config.cluster_nodes = NULL; config.redis_config = NULL; @@ -1742,7 +1715,15 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster: %d node(s).\n", config.cluster_node_count); exit(1); } - printf("Cluster has %d primary nodes:\n\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf("Cluster has %d %s nodes:\n\n", config.cluster_node_count, node_roles); int i = 0; for (; i < config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1750,7 +1731,8 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster node #%d\n", i); exit(1); } - printf("Primary %d: ", i); + const char *node_type = (node->replicate == NULL ? 
"Primary" : "Replica"); + printf("Node %d(%s): ", i, node_type); if (node->name) printf("%s ", node->name); printf("%s:%d\n", node->ip, node->port); node->redis_config = getServerConfig(node->ip, node->port, NULL); From 7f0cc83428ba71e460bd7b01e53673c84e5beb11 Mon Sep 17 00:00:00 2001 From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:32:40 -0800 Subject: [PATCH 027/101] Workflow changes to fix old release binaries (#1461) - Moves `build-config.json` to workflow dir to build old versions with new configs. - Enables contributors to test release Wf on private repo by adding `github.event_name == 'workflow_dispatch' ||` --------- Signed-off-by: Roshan Khatri --- .github/actions/generate-package-build-matrix/action.yml | 4 ++-- .../generate-package-build-matrix}/build-config.json | 0 .github/workflows/build-release-packages.yml | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) rename {utils/releasetools => .github/actions/generate-package-build-matrix}/build-config.json (100%) diff --git a/.github/actions/generate-package-build-matrix/action.yml b/.github/actions/generate-package-build-matrix/action.yml index 7e90f27be5..2494a71118 100644 --- a/.github/actions/generate-package-build-matrix/action.yml +++ b/.github/actions/generate-package-build-matrix/action.yml @@ -24,11 +24,11 @@ runs: - name: Get targets run: | - x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' utils/releasetools/build-config.json) + x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' .github/actions/generate-package-build-matrix/build-config.json) x86_matrix=$(echo "{ \"distro\" : $x86_arch }" | jq -c .) echo "X86_MATRIX=$x86_matrix" >> $GITHUB_ENV - arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' utils/releasetools/build-config.json) + arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' .github/actions/generate-package-build-matrix/build-config.json) arm_matrix=$(echo "{ \"distro\" : $arm_arch }" | jq -c .) echo "ARM_MATRIX=$arm_matrix" >> $GITHUB_ENV shell: bash diff --git a/utils/releasetools/build-config.json b/.github/actions/generate-package-build-matrix/build-config.json similarity index 100% rename from utils/releasetools/build-config.json rename to .github/actions/generate-package-build-matrix/build-config.json diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 3f1ca2627b..d7ab8e57d6 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -8,7 +8,7 @@ on: - '.github/workflows/build-release-packages.yml' - '.github/workflows/call-build-linux-arm-packages.yml' - '.github/workflows/call-build-linux-x86-packages.yml' - - 'utils/releasetools/build-config.json' + - '.github/actions/generate-package-build-matrix/build-config.json' workflow_dispatch: inputs: version: @@ -23,7 +23,7 @@ jobs: # This job provides the version metadata from the tag for the other jobs to use. 
   release-build-get-meta:
     name: Get metadata to build
-    if: github.repository == 'valkey-io/valkey'
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey'
     runs-on: ubuntu-latest
     outputs:
       version: ${{ steps.get_version.outputs.VERSION }}
@@ -69,7 +69,7 @@ jobs:

   generate-build-matrix:
     name: Generating build matrix
-    if: github.repository == 'valkey-io/valkey'
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey'
     runs-on: ubuntu-latest
     outputs:
       x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }}

From cfa292afd58a78235a208bf402a7b254d9c878b2 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Fri, 20 Dec 2024 10:14:01 +0800
Subject: [PATCH 028/101] Clear outdated failure reports more accurately (#1184)

There are two changes here:

1. In clusterNodeCleanupFailureReports: only a primary with slots can file a
failure report, so if the reporting primary becomes a replica its failure
report should be cleared. Keeping stale reports may lead to inaccurate node
fail judgments in some network partition cases, and it also affects the
CLUSTER COUNT-FAILURE-REPORTS command.

2. In clusterProcessGossipSection: this one is less important, but it lets us
print a "node is back online" log that helps us troubleshoot problems,
although it may conflict with change 1 in some cases.

Signed-off-by: Binbin
---
 src/cluster_legacy.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index bbf63d46b9..876beef91f 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1552,9 +1552,14 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
  * older than the global node timeout. Note that anyway for a node to be
  * flagged as FAIL we need to have a local PFAIL state that is at least
  * older than the global node timeout, so we don't just trust the number
- * of failure reports from other nodes. */
+ * of failure reports from other nodes.
+ *
+ * If the reporting node loses its voting right during this time, we will
+ * also clear its report. */
 void clusterNodeCleanupFailureReports(clusterNode *node) {
     list *l = node->fail_reports;
+    if (!listLength(l)) return;
+
     listNode *ln;
     listIter li;
     clusterNodeFailReport *fr;
@@ -1564,7 +1569,11 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
     listRewind(l, &li);
     while ((ln = listNext(&li)) != NULL) {
         fr = ln->value;
-        if (now - fr->time > maxtime) listDelNode(l, ln);
+        if (now - fr->time > maxtime) {
+            listDelNode(l, ln);
+        } else if (!clusterNodeIsVotingPrimary(fr->node)) {
+            listDelNode(l, ln);
+        }
     }
 }

@@ -1581,6 +1590,8 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
  * Otherwise 0 is returned. */
 int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
     list *l = node->fail_reports;
+    if (!listLength(l)) return 0;
+
     listNode *ln;
     listIter li;
     clusterNodeFailReport *fr;
@@ -2254,10 +2265,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
         /* Ignore gossips about self. */
         if (node && node != myself) {
             /* We already know this node.
-               Handle failure reports, only when the sender is a voting primary. */
-            if (sender && clusterNodeIsVotingPrimary(sender)) {
+             * Handle failure reports, the report is added only if the sender is a voting primary,
+             * and deletion of a failure report is not restricted.
 */
+            if (sender) {
                if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) {
-                    if (clusterNodeAddFailureReport(node, sender)) {
+                    if (clusterNodeIsVotingPrimary(sender) && clusterNodeAddFailureReport(node, sender)) {
                        serverLog(LL_NOTICE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
                                  sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }

From 13419cbda29ea0201dea2496326eb130147e7e0e Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Thu, 19 Dec 2024 18:14:56 -0800
Subject: [PATCH 029/101] Fix storing the wrong PID in active servers (#1464)

In #1459, I missed that the data was also used to keep track of the PID
files, so if the testing framework crashed it would no longer be able to
clean up the extra servers. So now we properly extract the PID and store
it so we can clean up PIDs.

Signed-off-by: Madelyn Olson
---
 tests/test_helper.tcl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 8a4125e48d..54bb923674 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -421,7 +421,8 @@ proc read_from_test_client fd {
    } elseif {$status eq {server-spawning}} {
        set ::active_clients_task($fd) "(SPAWNING SERVER) $data"
    } elseif {$status eq {server-spawned}} {
-        lappend ::active_servers $data
+        set pid [string trim [lindex [split $data "-"] 0]]
+        lappend ::active_servers $pid
        set ::active_clients_task($fd) "(SPAWNED SERVER) pid:$data"
    } elseif {$status eq {server-killing}} {
        set ::active_clients_task($fd) "(KILLING SERVER) pid:$data"

From b3daabe14a7b0e7ebd7630bd7caf263a2b4d0bac Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Thu, 19 Dec 2024 18:16:46 -0800
Subject: [PATCH 030/101] Update info.tcl test to revert client output limits
 sooner (#1462)

We set the client output buffer limits to 10 bytes, and then execute
`info stats`, which produces more than 10 bytes of output and can cause
that command to throw an error. I'm not sure why it wasn't consistently
erroring before; it might have been some change related to the ubuntu
upgrade. Issues related to ubuntu-tls are hopefully resolved now.
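For illustration, the failure mode being avoided looks roughly like this (a
sketch, not verbatim output; with a 10-byte hard limit, any reply larger than
10 bytes can get the client disconnected):

```
127.0.0.1:6379> CONFIG SET client-output-buffer-limit "normal 10 0 0"
OK
127.0.0.1:6379> INFO stats
Error: Server closed the connection
```

Reverting the limit before reading `info stats` keeps the test's own
introspection traffic from tripping it.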
Signed-off-by: Madelyn Olson
---
 tests/unit/info.tcl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 11dc4e5d40..3295c5e31a 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -406,10 +406,10 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} {
        r config set client-output-buffer-limit "normal 10 0 0"
        r set key [string repeat a 100000] ;# to trigger output buffer limit check this needs to be big
        catch {r get key}
+        r config set client-output-buffer-limit $org_outbuf_limit
        set info [r info stats]
        assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1}
-        r config set client-output-buffer-limit $org_outbuf_limit
-    } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres
+    } {} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres

    test {clients: pubsub clients} {
        set info [r info clients]

From beb95d334aaec898aab0df9c8020292b2046e3f1 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Fri, 20 Dec 2024 12:10:48 -0800
Subject: [PATCH 031/101] Resolve bounds checks on cluster_legacy.c (#1463)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are getting a number of errors like:
```
array subscript ‘clusterMsg[0]’ is partly outside array bounds of ‘unsigned char[2272]’
```

This is basically GCC telling us that we have an object which is longer
than the underlying storage of the allocation. We actually do this a
lot, but GCC is generally not aware of how big the underlying allocation
is, so it doesn't throw this error. We are specifically getting this
error because the msgBlock can be of variable length depending on the
type of message, but GCC assumes it's the longest one possible.

The solution I went with here was to make the message type optional, so
that it wasn't included in the size. I think this also makes some sense,
since it's really just a helper for us to easily cast the object around.
I considered disabling this error, but it is generally pretty useful
since it can catch real issues. Another solution would be to
over-allocate to the largest possible object, which could hurt
performance as we initialize it to zero.

Results: https://github.com/madolson/valkey/actions/runs/12423414811/job/34686899884

This is a slightly cleaned up version of
https://github.com/valkey-io/valkey/pull/1439. I thought I had another
strategy but alas, it didn't work out.

Signed-off-by: Madelyn Olson
---
 src/cluster_legacy.c | 39 ++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 876beef91f..9a23527b30 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -424,9 +424,19 @@ typedef struct {
    union {
        clusterMsg msg;
        clusterMsgLight msg_light;
-    };
+    } data[];
} clusterMsgSendBlock;

+/* Helper function to extract a light message from a send block. */
+static clusterMsgLight *getLightMessageFromSendBlock(clusterMsgSendBlock *msgblock) {
+    return &msgblock->data[0].msg_light;
+}
+
+/* Helper function to extract a normal message from a send block.
*/ +static clusterMsg *getMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg; +} + /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ @@ -1288,15 +1298,15 @@ void clusterReset(int hard) { * CLUSTER communication link * -------------------------------------------------------------------------- */ clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, msg); + uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, data); clusterMsgSendBlock *msgblock = zcalloc(blocklen); msgblock->refcount = 1; msgblock->totlen = blocklen; server.stat_cluster_links_memory += blocklen; if (IS_LIGHT_MESSAGE(type)) { - clusterBuildMessageHdrLight(&msgblock->msg_light, type, msglen); + clusterBuildMessageHdrLight(getLightMessageFromSendBlock(msgblock), type, msglen); } else { - clusterBuildMessageHdr(&msgblock->msg, type, msglen); + clusterBuildMessageHdr(getMessageFromSendBlock(msgblock), type, msglen); } return msgblock; } @@ -3668,7 +3678,7 @@ void clusterWriteHandler(connection *conn) { while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { listNode *head = listFirst(link->send_msg_queue); clusterMsgSendBlock *msgblock = (clusterMsgSendBlock *)head->value; - clusterMsg *msg = &msgblock->msg; + clusterMsg *msg = getMessageFromSendBlock(msgblock); size_t msg_offset = link->head_msg_send_offset; size_t msg_len = ntohl(msg->totlen); @@ -3853,7 +3863,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { if (!link) { return; } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + if (listLength(link->send_msg_queue) == 0 && getMessageFromSendBlock(msgblock)->totlen != 0) connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); listAddNodeTail(link->send_msg_queue, msgblock); @@ -3864,7 +3874,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { server.stat_cluster_links_memory += sizeof(listNode); /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); + uint16_t type = ntohs(getMessageFromSendBlock(msgblock)->type); if (type < CLUSTERMSG_TYPE_COUNT) server.cluster->stats_bus_messages_sent[type]++; } @@ -4050,7 +4060,7 @@ void clusterSendPing(clusterLink *link, int type) { * sizeof(clusterMsg) or more. 
*/ if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); @@ -4195,10 +4205,10 @@ clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); clusterMsgDataPublish *hdr_data_msg; if (is_light) { - clusterMsgLight *hdr_light = &msgblock->msg_light; + clusterMsgLight *hdr_light = getLightMessageFromSendBlock(msgblock); hdr_data_msg = &hdr_light->data.publish.msg; } else { - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr_data_msg = &hdr->data.publish.msg; } hdr_data_msg->channel_len = htonl(channel_len); @@ -4221,7 +4231,7 @@ void clusterSendFail(char *nodename) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataFail); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.fail.about.nodename, nodename, CLUSTER_NAMELEN); clusterBroadcastMessage(msgblock); @@ -4237,7 +4247,7 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataUpdate); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.update.nodecfg.nodename, node->name, CLUSTER_NAMELEN); hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); memcpy(hdr->data.update.nodecfg.slots, node->slots, sizeof(node->slots)); @@ -4259,7 +4269,7 @@ void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, cons msglen += sizeof(clusterMsgModule) - 3 + len; clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ hdr->data.module.msg.type = type; hdr->data.module.msg.len = htonl(len); @@ -4348,11 +4358,10 @@ void clusterRequestFailoverAuth(void) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - clusterMsg *hdr = &msgblock->msg; /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit * in the header to communicate the nodes receiving the message that * they should authorized the failover even if the primary is working. */ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; clusterBroadcastMessage(msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } From 32b09c67d378dc348338075a9a282b8a7e0233f9 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Sat, 21 Dec 2024 22:09:35 +0000 Subject: [PATCH 032/101] Adds support for scripting engines as Valkey modules (#1277) This PR extends the module API to support the addition of different scripting engines to execute user defined functions. 
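At a high level, the intended user flow might look like the following sketch
(the module path, engine name, and script are hypothetical, modeled on the
test module described below):

```
127.0.0.1:6379> MODULE LOAD /path/to/helloengine.so
OK
127.0.0.1:6379> FUNCTION LOAD "#!hello name=mylib\nFUNCTION bar\nCONSTI 432\nRETURN"
"mylib"
127.0.0.1:6379> FCALL bar 0
(integer) 432
```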
The scripting engine can be implemented as a Valkey module, and can be
dynamically loaded with the `loadmodule` config directive, or with the
`MODULE LOAD` command.

This PR also adds an example of a dummy scripting engine module, to show
how to use the new module API. The dummy module is implemented in
`tests/modules/helloscripting.c`.

Currently, the module API only supports loading scripting engines to run
functions using the `FCALL` command.

The additions to the module API are the following:

```c
/* This struct represents a scripting engine function that results from the
 * compilation of a script by the engine implementation. */
struct ValkeyModuleScriptingEngineCompiledFunction

typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    const char *code,
    size_t timeout,
    size_t *out_num_compiled_functions,
    char **err);

typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
    ValkeyModuleCtx *module_ctx,
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
    void *compiled_function,
    ValkeyModuleString **keys,
    size_t nkeys,
    ValkeyModuleString **args,
    size_t nargs);

typedef size_t (*ValkeyModuleScriptingEngineGetUsedMemoryFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx);

typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
    void *compiled_function);

typedef size_t (*ValkeyModuleScriptingEngineGetEngineMemoryOverheadFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx);

typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    void *compiled_function);

/* This struct stores the callback functions implemented by the scripting
 * engine to provide the functionality for the `FUNCTION *` commands. */
typedef struct ValkeyModuleScriptingEngineMethodsV1 {
    uint64_t version; /* Version of this structure for ABI compat. */

    /* Library create function callback. When a new script is loaded, this
     * callback will be called with the script code, and returns a list of
     * ValkeyModuleScriptingEngineCompiledFunc objects. */
    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;

    /* The callback function called when `FCALL` command is called on a function
     * registered in this engine. */
    ValkeyModuleScriptingEngineCallFunctionFunc call_function;

    /* Function callback to get current used memory by the engine. */
    ValkeyModuleScriptingEngineGetUsedMemoryFunc get_used_memory;

    /* Function callback to return memory overhead for a given function. */
    ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead;

    /* Function callback to return memory overhead of the engine. */
    ValkeyModuleScriptingEngineGetEngineMemoryOverheadFunc get_engine_memory_overhead;

    /* Function callback to free the memory of a registered engine function. */
    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
} ValkeyModuleScriptingEngineMethodsV1;

/* Registers a new scripting engine in the server.
 *
 * - `engine_name`: the name of the scripting engine. This name will match
 *   against the engine name specified in the script header using a shebang.
 *
 * - `engine_ctx`: engine specific context pointer.
 *
 * - `engine_methods`: the struct with the scripting engine callback functions
 *   pointers.
 */
int ValkeyModule_RegisterScriptingEngine(ValkeyModuleCtx *ctx,
                                         const char *engine_name,
                                         void *engine_ctx,
                                         ValkeyModuleScriptingEngineMethods engine_methods);

/* Removes the scripting engine from the server.
 *
 * `engine_name` is the name of the scripting engine.
 *
 */
int ValkeyModule_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name);
```

---------

Signed-off-by: Ricardo Dias
---
 src/function_lua.c                       | 205 +++++++-----
 src/functions.c                          | 240 ++++++++++++--
 src/functions.h                          |  74 +++--
 src/module.c                             |  76 +++++
 src/module.h                             |  17 +
 src/script.h                             |   2 +
 src/script_lua.c                         |   6 +-
 src/script_lua.h                         |   2 +-
 src/util.c                               |  21 ++
 src/util.h                               |   1 +
 src/valkeymodule.h                       |  99 +++++-
 tests/modules/CMakeLists.txt             |   1 +
 tests/modules/Makefile                   |   3 +-
 tests/modules/helloscripting.c           | 383 +++++++++++++++++++++++
 tests/unit/functions.tcl                 |   4 +-
 tests/unit/moduleapi/scriptingengine.tcl | 126 ++++++++
 16 files changed, 1124 insertions(+), 136 deletions(-)
 create mode 100644 src/module.h
 create mode 100644 tests/modules/helloscripting.c
 create mode 100644 tests/unit/moduleapi/scriptingengine.tcl

diff --git a/src/function_lua.c b/src/function_lua.c
index fa9983bf7e..b535528906 100644
--- a/src/function_lua.c
+++ b/src/function_lua.c
@@ -64,17 +64,14 @@ typedef struct luaFunctionCtx {
} luaFunctionCtx;

typedef struct loadCtx {
-    functionLibInfo *li;
+    list *functions;
    monotime start_time;
    size_t timeout;
} loadCtx;

-typedef struct registerFunctionArgs {
-    sds name;
-    sds desc;
-    luaFunctionCtx *lua_f_ctx;
-    uint64_t f_flags;
-} registerFunctionArgs;
+static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx,
+                                  engineCtx *engine_ctx,
+                                  void *compiled_function);

/* Hook for FUNCTION LOAD execution.
 * Used to cancel the execution in case of a timeout (500ms).
@@ -93,15 +90,42 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) {
    }
}

+static void freeCompiledFunc(ValkeyModuleCtx *module_ctx,
+                             luaEngineCtx *lua_engine_ctx,
+                             void *compiled_func) {
+    /* The lua engine is implemented in the core, and not in a Valkey Module */
+    serverAssert(module_ctx == NULL);
+
+    compiledFunction *func = compiled_func;
+    decrRefCount(func->name);
+    if (func->desc) {
+        decrRefCount(func->desc);
+    }
+    luaEngineFreeFunction(module_ctx, lua_engine_ctx, func->function);
+    zfree(func);
+}
+
/*
- * Compile a given blob and save it on the registry.
- * Return a function ctx with Lua ref that allows to later retrieve the
- * function from the registry.
+ * Compile a given script code by generating a set of compiled functions. These
+ * functions are also saved into the registry of the Lua environment.
+ *
+ * Returns an array of compiled functions. The `compiledFunction` struct stores a
+ * Lua ref that allows to later retrieve the function from the registry.
+ * The size of the array is returned in the `out_num_compiled_functions`
+ * parameter.
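+ * The caller takes ownership of the returned array and of each compiled
+ * function in it (see freeCompiledFunctions() in functions.c).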
* * Return NULL on compilation error and set the error to the err variable */ -static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size_t timeout, sds *err) { - int ret = C_ERR; +static compiledFunction **luaEngineCreate(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction **compiled_functions = NULL; luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; @@ -114,15 +138,15 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_pop(lua, 1); /* pop the metatable */ /* compile the code */ - if (luaL_loadbuffer(lua, blob, sdslen(blob), "@user_function")) { - *err = sdscatprintf(sdsempty(), "Error compiling function: %s", lua_tostring(lua, -1)); + if (luaL_loadbuffer(lua, code, strlen(code), "@user_function")) { + *err = valkey_asprintf("Error compiling function: %s", lua_tostring(lua, -1)); lua_pop(lua, 1); /* pops the error */ goto done; } serverAssert(lua_isfunction(lua, -1)); loadCtx load_ctx = { - .li = li, + .functions = listCreate(), .start_time = getMonotonicUs(), .timeout = timeout, }; @@ -133,13 +157,31 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size if (lua_pcall(lua, 0, 0, 0)) { errorInfo err_info = {0}; luaExtractErrorInformation(lua, &err_info); - *err = sdscatprintf(sdsempty(), "Error registering functions: %s", err_info.msg); + *err = valkey_asprintf("Error registering functions: %s", err_info.msg); lua_pop(lua, 1); /* pops the error */ luaErrorInformationDiscard(&err_info); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + while ((node = listNext(iter)) != NULL) { + freeCompiledFunc(module_ctx, lua_engine_ctx, listNodeValue(node)); + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); goto done; } - ret = C_OK; + compiled_functions = + zcalloc(sizeof(compiledFunction *) * listLength(load_ctx.functions)); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + *out_num_compiled_functions = 0; + while ((node = listNext(iter)) != NULL) { + compiledFunction *func = listNodeValue(node); + compiled_functions[*out_num_compiled_functions] = func; + (*out_num_compiled_functions)++; + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); done: /* restore original globals */ @@ -152,19 +194,23 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_sethook(lua, NULL, 0, 0); /* Disable hook */ luaSaveOnRegistry(lua, REGISTRY_LOAD_CTX_NAME, NULL); - return ret; + return compiled_functions; } /* * Invole the give function with the given keys and args */ -static void luaEngineCall(scriptRunCtx *run_ctx, - void *engine_ctx, +static void luaEngineCall(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + functionCtx *func_ctx, void *compiled_function, robj **keys, size_t nkeys, robj **args, size_t nargs) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -177,25 +223,38 @@ static void luaEngineCall(scriptRunCtx *run_ctx, serverAssert(lua_isfunction(lua, -1)); + scriptRunCtx *run_ctx = (scriptRunCtx *)func_ctx; 
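+    /* func_ctx is opaque at the module API level; for the built-in Lua engine
+     * it is the scriptRunCtx prepared by the FCALL command path. */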
luaCallFunction(run_ctx, lua, keys, nkeys, args, nargs, 0); lua_pop(lua, 1); /* Pop error handler */ } -static size_t luaEngineGetUsedMemoy(void *engine_ctx) { +static engineMemoryInfo luaEngineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; - return luaMemory(lua_engine_ctx->lua); + + return (engineMemoryInfo){ + .used_memory = luaMemory(lua_engine_ctx->lua), + .engine_memory_overhead = zmalloc_size(lua_engine_ctx), + }; } -static size_t luaEngineFunctionMemoryOverhead(void *compiled_function) { +static size_t luaEngineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + return zmalloc_size(compiled_function); } -static size_t luaEngineMemoryOverhead(void *engine_ctx) { - luaEngineCtx *lua_engine_ctx = engine_ctx; - return zmalloc_size(lua_engine_ctx); -} +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); -static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -203,26 +262,19 @@ static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { zfree(f_ctx); } -static void luaRegisterFunctionArgsInitialize(registerFunctionArgs *register_f_args, - sds name, - sds desc, +static void luaRegisterFunctionArgsInitialize(compiledFunction *func, + robj *name, + robj *desc, luaFunctionCtx *lua_f_ctx, uint64_t flags) { - *register_f_args = (registerFunctionArgs){ + *func = (compiledFunction){ .name = name, .desc = desc, - .lua_f_ctx = lua_f_ctx, + .function = lua_f_ctx, .f_flags = flags, }; } -static void luaRegisterFunctionArgsDispose(lua_State *lua, registerFunctionArgs *register_f_args) { - sdsfree(register_f_args->name); - if (register_f_args->desc) sdsfree(register_f_args->desc); - lua_unref(lua, register_f_args->lua_f_ctx->lua_function_ref); - zfree(register_f_args->lua_f_ctx); -} - /* Read function flags located on the top of the Lua stack. * On success, return C_OK and set the flags to 'flags' out parameter * Return C_ERR if encounter an unknown flag. 
*/ @@ -267,10 +319,11 @@ static int luaRegisterFunctionReadFlags(lua_State *lua, uint64_t *flags) { return ret; } -static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadNamedArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; - sds desc = NULL; + robj *name = NULL; + robj *desc = NULL; luaFunctionCtx *lua_f_ctx = NULL; uint64_t flags = 0; if (!lua_istable(lua, 1)) { @@ -287,14 +340,15 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs err = "named argument key given to server.register_function is not a string"; goto error; } + const char *key = lua_tostring(lua, -2); if (!strcasecmp(key, "function_name")) { - if (!(name = luaGetStringSds(lua, -1))) { + if (!(name = luaGetStringObject(lua, -1))) { err = "function_name argument given to server.register_function must be a string"; goto error; } } else if (!strcasecmp(key, "description")) { - if (!(desc = luaGetStringSds(lua, -1))) { + if (!(desc = luaGetStringObject(lua, -1))) { err = "description argument given to server.register_function must be a string"; goto error; } @@ -335,13 +389,17 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs goto error; } - luaRegisterFunctionArgsInitialize(register_f_args, name, desc, lua_f_ctx, flags); + luaRegisterFunctionArgsInitialize(func, + name, + desc, + lua_f_ctx, + flags); return C_OK; error: - if (name) sdsfree(name); - if (desc) sdsfree(desc); + if (name) decrRefCount(name); + if (desc) decrRefCount(desc); if (lua_f_ctx) { lua_unref(lua, lua_f_ctx->lua_function_ref); zfree(lua_f_ctx); @@ -350,11 +408,12 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs return C_ERR; } -static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; + robj *name = NULL; luaFunctionCtx *lua_f_ctx = NULL; - if (!(name = luaGetStringSds(lua, 1))) { + if (!(name = luaGetStringObject(lua, 1))) { err = "first argument to server.register_function must be a string"; goto error; } @@ -369,17 +428,17 @@ static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctio lua_f_ctx = zmalloc(sizeof(*lua_f_ctx)); lua_f_ctx->lua_function_ref = lua_function_ref; - luaRegisterFunctionArgsInitialize(register_f_args, name, NULL, lua_f_ctx, 0); + luaRegisterFunctionArgsInitialize(func, name, NULL, lua_f_ctx, 0); return C_OK; error: - if (name) sdsfree(name); + if (name) decrRefCount(name); luaPushError(lua, err); return C_ERR; } -static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadArgs(lua_State *lua, compiledFunction *func) { int argc = lua_gettop(lua); if (argc < 1 || argc > 2) { luaPushError(lua, "wrong number of arguments to server.register_function"); @@ -387,33 +446,28 @@ static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *reg } if (argc == 1) { - return luaRegisterFunctionReadNamedArgs(lua, register_f_args); + return luaRegisterFunctionReadNamedArgs(lua, func); } else { - return luaRegisterFunctionReadPositionalArgs(lua, register_f_args); + return luaRegisterFunctionReadPositionalArgs(lua, func); } } static int luaRegisterFunction(lua_State *lua) { - registerFunctionArgs register_f_args = {0}; + compiledFunction *func = 
zcalloc(sizeof(*func)); loadCtx *load_ctx = luaGetFromRegistry(lua, REGISTRY_LOAD_CTX_NAME); if (!load_ctx) { + zfree(func); luaPushError(lua, "server.register_function can only be called on FUNCTION LOAD command"); return luaError(lua); } - if (luaRegisterFunctionReadArgs(lua, ®ister_f_args) != C_OK) { + if (luaRegisterFunctionReadArgs(lua, func) != C_OK) { + zfree(func); return luaError(lua); } - sds err = NULL; - if (functionLibCreateFunction(register_f_args.name, register_f_args.lua_f_ctx, load_ctx->li, register_f_args.desc, - register_f_args.f_flags, &err) != C_OK) { - luaRegisterFunctionArgsDispose(lua, ®ister_f_args); - luaPushError(lua, err); - sdsfree(err); - return luaError(lua); - } + listAddNodeTail(load_ctx->functions, func); return 0; } @@ -494,16 +548,17 @@ int luaEngineInitEngine(void) { lua_enablereadonlytable(lua_engine_ctx->lua, -1, 1); /* protect the new global table */ lua_replace(lua_engine_ctx->lua, LUA_GLOBALSINDEX); /* set new global table as the new globals */ - - engine *lua_engine = zmalloc(sizeof(*lua_engine)); - *lua_engine = (engine){ - .engine_ctx = lua_engine_ctx, - .create = luaEngineCreate, - .call = luaEngineCall, - .get_used_memory = luaEngineGetUsedMemoy, + engineMethods lua_engine_methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = luaEngineCreate, + .call_function = luaEngineCall, .get_function_memory_overhead = luaEngineFunctionMemoryOverhead, - .get_engine_memory_overhead = luaEngineMemoryOverhead, .free_function = luaEngineFreeFunction, + .get_memory_info = luaEngineGetMemoryInfo, }; - return functionsRegisterEngine(LUA_ENGINE_NAME, lua_engine); + + return functionsRegisterEngine(LUA_ENGINE_NAME, + NULL, + lua_engine_ctx, + &lua_engine_methods); } diff --git a/src/functions.c b/src/functions.c index feb82d4ab7..0d003f7fac 100644 --- a/src/functions.c +++ b/src/functions.c @@ -31,6 +31,7 @@ #include "sds.h" #include "dict.h" #include "adlist.h" +#include "module.h" #define LOAD_TIMEOUT_MS 500 @@ -117,9 +118,28 @@ static dict *engines = NULL; /* Libraries Ctx. */ static functionsLibCtx *curr_functions_lib_ctx = NULL; +static void setupEngineModuleCtx(engineInfo *ei, client *c) { + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + moduleScriptingEngineInitContext(ei->module_ctx, ei->engineModule, c); + } +} + +static void teardownEngineModuleCtx(engineInfo *ei) { + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + moduleFreeContext(ei->module_ctx); + } +} + static size_t functionMallocSize(functionInfo *fi) { - return zmalloc_size(fi) + sdsAllocSize(fi->name) + (fi->desc ? sdsAllocSize(fi->desc) : 0) + - fi->li->ei->engine->get_function_memory_overhead(fi->function); + setupEngineModuleCtx(fi->li->ei, NULL); + size_t size = zmalloc_size(fi) + + sdsAllocSize(fi->name) + + (fi->desc ? 
sdsAllocSize(fi->desc) : 0) + + fi->li->ei->engine->get_function_memory_overhead(fi->li->ei->module_ctx, fi->function); + teardownEngineModuleCtx(fi->li->ei); + return size; } static size_t libraryMallocSize(functionLibInfo *li) { @@ -141,8 +161,12 @@ static void engineFunctionDispose(void *obj) { if (fi->desc) { sdsfree(fi->desc); } + setupEngineModuleCtx(fi->li->ei, NULL); engine *engine = fi->li->ei->engine; - engine->free_function(engine->engine_ctx, fi->function); + engine->free_function(fi->li->ei->module_ctx, + engine->engine_ctx, + fi->function); + teardownEngineModuleCtx(fi->li->ei); zfree(fi); } @@ -233,6 +257,15 @@ functionsLibCtx *functionsLibCtxCreate(void) { return ret; } +void functionsAddEngineStats(engineInfo *ei) { + serverAssert(curr_functions_lib_ctx != NULL); + dictEntry *entry = dictFind(curr_functions_lib_ctx->engines_stats, ei->name); + if (entry == NULL) { + functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); + dictAdd(curr_functions_lib_ctx->engines_stats, ei->name, stats); + } +} + /* * Creating a function inside the given library. * On success, return C_OK. @@ -242,24 +275,34 @@ functionsLibCtx *functionsLibCtxCreate(void) { * the function will verify that the given name is following the naming format * and return an error if its not. */ -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err) { - if (functionsVerifyName(name) != C_OK) { - *err = sdsnew("Library names can only contain letters, numbers, or underscores(_) and must be at least one " - "character long"); +static int functionLibCreateFunction(robj *name, + void *function, + functionLibInfo *li, + robj *desc, + uint64_t f_flags, + sds *err) { + serverAssert(name->type == OBJ_STRING); + serverAssert(desc == NULL || desc->type == OBJ_STRING); + + if (functionsVerifyName(name->ptr) != C_OK) { + *err = sdsnew("Function names can only contain letters, numbers, or " + "underscores(_) and must be at least one character long"); return C_ERR; } - if (dictFetchValue(li->functions, name)) { + sds name_sds = sdsdup(name->ptr); + if (dictFetchValue(li->functions, name_sds)) { *err = sdsnew("Function already exists in the library"); + sdsfree(name_sds); return C_ERR; } functionInfo *fi = zmalloc(sizeof(*fi)); *fi = (functionInfo){ - .name = name, + .name = name_sds, .function = function, .li = li, - .desc = desc, + .desc = desc ? sdsdup(desc->ptr) : NULL, .f_flags = f_flags, }; @@ -403,11 +446,24 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l return ret; } -/* Register an engine, should be called once by the engine on startup and give the following: +/* Register an engine, should be called once by the engine on startup and give + * the following: * * - engine_name - name of the engine to register - * - engine_ctx - the engine ctx that should be used by the server to interact with the engine */ -int functionsRegisterEngine(const char *engine_name, engine *engine) { + * + * - engine_module - the valkey module that implements this engine + * + * - engine_ctx - the engine ctx that should be used by the server to interact + * with the engine. + * + * - engine_methods - the struct with the scripting engine callback functions + * pointers. 
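+ *
+ * Returns C_OK on success, or C_ERR if an engine with the same name was
+ * already registered.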
+ * + */ +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + engineCtx *engine_ctx, + engineMethods *engine_methods) { sds engine_name_sds = sdsnew(engine_name); if (dictFetchValue(engines, engine_name_sds)) { serverLog(LL_WARNING, "Same engine was registered twice"); @@ -415,6 +471,16 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { return C_ERR; } + engine *eng = zmalloc(sizeof(engine)); + *eng = (engine){ + .engine_ctx = engine_ctx, + .create = engine_methods->create_functions_library, + .call = engine_methods->call_function, + .get_function_memory_overhead = engine_methods->get_function_memory_overhead, + .free_function = engine_methods->free_function, + .get_memory_info = engine_methods->get_memory_info, + }; + client *c = createClient(NULL); c->flag.deny_blocking = 1; c->flag.script = 1; @@ -422,15 +488,64 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { engineInfo *ei = zmalloc(sizeof(*ei)); *ei = (engineInfo){ .name = engine_name_sds, - .engine = engine, + .engineModule = engine_module, + .module_ctx = engine_module ? moduleAllocateContext() : NULL, + .engine = eng, .c = c, }; dictAdd(engines, engine_name_sds, ei); - engine_cache_memory += zmalloc_size(ei) + sdsAllocSize(ei->name) + zmalloc_size(engine) + - engine->get_engine_memory_overhead(engine->engine_ctx); + functionsAddEngineStats(ei); + + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = eng->get_memory_info(ei->module_ctx, + eng->engine_ctx); + engine_cache_memory += zmalloc_size(ei) + + sdsAllocSize(ei->name) + + zmalloc_size(eng) + + mem_info.engine_memory_overhead; + + teardownEngineModuleCtx(ei); + + return C_OK; +} + +/* Removes a scripting engine from the server. + * + * - engine_name - name of the engine to remove + */ +int functionsUnregisterEngine(const char *engine_name) { + sds engine_name_sds = sdsnew(engine_name); + dictEntry *entry = dictFind(engines, engine_name_sds); + if (entry == NULL) { + serverLog(LL_WARNING, "There's no engine registered with name %s", engine_name); + sdsfree(engine_name_sds); + return C_ERR; + } + + engineInfo *ei = dictGetVal(entry); + + dictIterator *iter = dictGetSafeIterator(curr_functions_lib_ctx->libraries); + while ((entry = dictNext(iter))) { + functionLibInfo *li = dictGetVal(entry); + if (li->ei == ei) { + libraryUnlink(curr_functions_lib_ctx, li); + engineLibraryFree(li); + } + } + dictReleaseIterator(iter); + + zfree(ei->engine); + sdsfree(ei->name); + freeClient(ei->c); + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + zfree(ei->module_ctx); + } + zfree(ei); + sdsfree(engine_name_sds); return C_OK; } @@ -649,11 +764,19 @@ static void fcallCommandGeneric(client *c, int ro) { } scriptRunCtx run_ctx; - if (scriptPrepareForRun(&run_ctx, fi->li->ei->c, c, fi->name, fi->f_flags, ro) != C_OK) return; - - engine->call(&run_ctx, engine->engine_ctx, fi->function, c->argv + 3, numkeys, c->argv + 3 + numkeys, + setupEngineModuleCtx(fi->li->ei, run_ctx.original_client); + + engine->call(fi->li->ei->module_ctx, + engine->engine_ctx, + &run_ctx, + fi->function, + c->argv + 3, + numkeys, + c->argv + 3 + numkeys, c->argc - 3 - numkeys); + + teardownEngineModuleCtx(fi->li->ei); scriptResetRun(&run_ctx); } @@ -953,14 +1076,40 @@ void functionFreeLibMetaData(functionsLibMetaData *md) { if (md->engine) sdsfree(md->engine); } +static void freeCompiledFunctions(engineInfo *ei, + compiledFunction **compiled_functions, + size_t num_compiled_functions, + size_t 
free_function_from_idx) { + setupEngineModuleCtx(ei, NULL); + + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + if (i >= free_function_from_idx) { + ei->engine->free_function(ei->module_ctx, + ei->engine->engine_ctx, + func->function); + } + zfree(func); + } + + zfree(compiled_functions); + + teardownEngineModuleCtx(ei); +} + /* Compile and save the given library, return the loaded library name on success * and NULL on failure. In case on failure the err out param is set with relevant error message */ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout) { dictIterator *iter = NULL; dictEntry *entry = NULL; - functionLibInfo *new_li = NULL; functionLibInfo *old_li = NULL; functionsLibMetaData md = {0}; + functionLibInfo *new_li = NULL; + if (functionExtractLibMetaData(code, &md, err) != C_OK) { return NULL; } @@ -990,10 +1139,47 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibC } new_li = engineLibraryCreate(md.name, ei, code); - if (engine->create(engine->engine_ctx, new_li, md.code, timeout, err) != C_OK) { + size_t num_compiled_functions = 0; + char *compile_error = NULL; + setupEngineModuleCtx(ei, NULL); + compiledFunction **compiled_functions = + engine->create(ei->module_ctx, + engine->engine_ctx, + md.code, + timeout, + &num_compiled_functions, + &compile_error); + teardownEngineModuleCtx(ei); + if (compiled_functions == NULL) { + serverAssert(num_compiled_functions == 0); + serverAssert(compile_error != NULL); + *err = sdsnew(compile_error); + zfree(compile_error); goto error; } + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + int ret = functionLibCreateFunction(func->name, + func->function, + new_li, + func->desc, + func->f_flags, + err); + if (ret == C_ERR) { + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + i); + goto error; + } + } + + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + num_compiled_functions); + if (dictSize(new_li->functions) == 0) { *err = sdsnew("No functions registered"); goto error; @@ -1063,6 +1249,7 @@ void functionLoadCommand(client *c) { timeout = 0; } if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx, timeout))) { + serverAssert(err != NULL); addReplyErrorSds(c, err); return; } @@ -1080,7 +1267,11 @@ unsigned long functionsMemory(void) { while ((entry = dictNext(iter))) { engineInfo *ei = dictGetVal(entry); engine *engine = ei->engine; - engines_memory += engine->get_used_memory(engine->engine_ctx); + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = engine->get_memory_info(ei->module_ctx, + engine->engine_ctx); + engines_memory += mem_info.used_memory; + teardownEngineModuleCtx(ei); } dictReleaseIterator(iter); @@ -1120,12 +1311,11 @@ size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx) { int functionsInit(void) { engines = dictCreate(&engineDictType); + curr_functions_lib_ctx = functionsLibCtxCreate(); + if (luaEngineInitEngine() != C_OK) { return C_ERR; } - /* Must be initialized after engines initialization */ - curr_functions_lib_ctx = functionsLibCtxCreate(); - return C_OK; } diff --git a/src/functions.h b/src/functions.h index b199fbd06e..89e39fdc56 100644 --- a/src/functions.h +++ b/src/functions.h @@ -54,53 +54,68 @@ typedef struct 
functionLibInfo functionLibInfo;

+/* ValkeyModule type aliases for scripting engine structs and types. */
+typedef ValkeyModuleScriptingEngineCtx engineCtx;
+typedef ValkeyModuleScriptingEngineFunctionCtx functionCtx;
+typedef ValkeyModuleScriptingEngineCompiledFunction compiledFunction;
+typedef ValkeyModuleScriptingEngineMemoryInfo engineMemoryInfo;
+typedef ValkeyModuleScriptingEngineMethods engineMethods;
+
typedef struct engine {
    /* engine specific context */
-    void *engine_ctx;
-
-    /* Create function callback, get the engine_ctx, and function code
-     * engine_ctx - opaque struct that was created on engine initialization
-     * li - library information that need to be provided and when add functions
-     * code - the library code
-     * timeout - timeout for the library creation (0 for no timeout)
-     * err - description of error (if occurred)
-     * returns C_ERR on error and set err to be the error message */
-    int (*create)(void *engine_ctx, functionLibInfo *li, sds code, size_t timeout, sds *err);
-
-    /* Invoking a function, r_ctx is an opaque object (from engine POV).
-     * The r_ctx should be used by the engine to interaction with the server,
+    engineCtx *engine_ctx;
+
+    /* Compiles the script code and returns an array of compiled functions
+     * registered in the script.
+     *
+     * Returns NULL on error and set err to be the error message */
+    compiledFunction **(*create)(
+        ValkeyModuleCtx *module_ctx,
+        engineCtx *engine_ctx,
+        const char *code,
+        size_t timeout,
+        size_t *out_num_compiled_functions,
+        char **err);
+
+    /* Invoking a function, func_ctx is an opaque object (from engine POV).
+     * The func_ctx should be used by the engine to interact with the server,
      * such interaction could be running commands, set resp, or set
      * replication mode
     */
-    void (*call)(scriptRunCtx *r_ctx,
-                 void *engine_ctx,
+    void (*call)(ValkeyModuleCtx *module_ctx,
+                 engineCtx *engine_ctx,
+                 functionCtx *func_ctx,
                 void *compiled_function,
                 robj **keys,
                 size_t nkeys,
                 robj **args,
                 size_t nargs);

-    /* get current used memory by the engine */
-    size_t (*get_used_memory)(void *engine_ctx);
+    /* free the given function */
+    void (*free_function)(ValkeyModuleCtx *module_ctx,
+                          engineCtx *engine_ctx,
+                          void *compiled_function);

    /* Return memory overhead for a given function,
     * such memory is not counted as engine memory but as general
     * structs memory that hold different information */
-    size_t (*get_function_memory_overhead)(void *compiled_function);
+    size_t (*get_function_memory_overhead)(ValkeyModuleCtx *module_ctx,
+                                           void *compiled_function);

-    /* Return memory overhead for engine (struct size holding the engine)*/
-    size_t (*get_engine_memory_overhead)(void *engine_ctx);
+    /* Get the current used memory by the engine */
+    engineMemoryInfo (*get_memory_info)(ValkeyModuleCtx *module_ctx,
+                                        engineCtx *engine_ctx);

-    /* free the given function */
-    void (*free_function)(void *engine_ctx, void *compiled_function);
} engine;

/* Hold information about an engine.
 * Used on rdb.c so it must be declared here.
*/ typedef struct engineInfo { - sds name; /* Name of the engine */ - engine *engine; /* engine callbacks that allows to interact with the engine */ - client *c; /* Client that is used to run commands */ + sds name; /* Name of the engine */ + ValkeyModule *engineModule; /* the module that implements the scripting engine */ + ValkeyModuleCtx *module_ctx; /* Scripting engine module context */ + engine *engine; /* engine callbacks that allows to interact with the engine */ + client *c; /* Client that is used to run commands */ } engineInfo; /* Hold information about the specific function. @@ -123,7 +138,12 @@ struct functionLibInfo { sds code; /* Library code */ }; -int functionsRegisterEngine(const char *engine_name, engine *engine_ctx); +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + void *engine_ctx, + engineMethods *engine_methods); +int functionsUnregisterEngine(const char *engine_name); + sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout); unsigned long functionsMemory(void); unsigned long functionsMemoryOverhead(void); @@ -138,8 +158,6 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)); void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); - int luaEngineInitEngine(void); int functionsInit(void); diff --git a/src/module.c b/src/module.c index 541ae490ab..db493dd8bc 100644 --- a/src/module.c +++ b/src/module.c @@ -62,6 +62,7 @@ #include "crc16_slottable.h" #include "valkeymodule.h" #include "io_threads.h" +#include "functions.h" #include #include #include @@ -879,6 +880,15 @@ void moduleCallCommandUnblockedHandler(client *c) { moduleReleaseTempClient(c); } +/* Allocates the memory necessary to hold the ValkeyModuleCtx structure, and + * returns the pointer to the allocated memory. + * + * Used by the scripting engines implementation to cache the context structure. + */ +ValkeyModuleCtx *moduleAllocateContext(void) { + return (ValkeyModuleCtx *)zcalloc(sizeof(ValkeyModuleCtx)); +} + /* Create a module ctx and keep track of the nesting level. * * Note: When creating ctx for threads (VM_GetThreadSafeContext and @@ -921,6 +931,16 @@ void moduleCreateContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, int ctx } } +/* Initialize a module context to be used by scripting engines callback + * functions. + */ +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client) { + moduleCreateContext(out_ctx, module, VALKEYMODULE_CTX_NONE); + out_ctx->client = client; +} + /* This command binds the normal command invocation with commands * exported by modules. */ void ValkeyModuleCommandDispatcher(client *c) { @@ -13074,6 +13094,60 @@ int VM_RdbSave(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { return VALKEYMODULE_OK; } +/* Registers a new scripting engine in the server. + * + * - `module_ctx`: the module context object. + * + * - `engine_name`: the name of the scripting engine. This name will match + * against the engine name specified in the script header using a shebang. + * + * - `engine_ctx`: engine specific context pointer. + * + * - `engine_methods`: the struct with the scripting engine callback functions + * pointers. 
+ * + * Returns VALKEYMODULE_OK if the engine is successfully registered, and + * VALKEYMODULE_ERR in case some failure occurs. In case of a failure, an error + * message is logged. + */ +int VM_RegisterScriptingEngine(ValkeyModuleCtx *module_ctx, + const char *engine_name, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineMethods *engine_methods) { + serverLog(LL_DEBUG, "Registering a new scripting engine: %s", engine_name); + + if (engine_methods->version > VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION) { + serverLog(LL_WARNING, "The engine implementation version is greater " + "than what this server supports. Server ABI " + "Version: %lu, Engine ABI version: %lu", + VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + (unsigned long)engine_methods->version); + return VALKEYMODULE_ERR; + } + + if (functionsRegisterEngine(engine_name, + module_ctx->module, + engine_ctx, + engine_methods) != C_OK) { + return VALKEYMODULE_ERR; + } + + return VALKEYMODULE_OK; +} + +/* Removes the scripting engine from the server. + * + * `engine_name` is the name of the scripting engine. + * + * Returns VALKEYMODULE_OK. + * + */ +int VM_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name) { + UNUSED(ctx); + functionsUnregisterEngine(engine_name); + return VALKEYMODULE_OK; +} + /* MODULE command. * * MODULE LIST @@ -13944,4 +14018,6 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(RdbStreamFree); REGISTER_API(RdbLoad); REGISTER_API(RdbSave); + REGISTER_API(RegisterScriptingEngine); + REGISTER_API(UnregisterScriptingEngine); } diff --git a/src/module.h b/src/module.h new file mode 100644 index 0000000000..f61ef1e3cb --- /dev/null +++ b/src/module.h @@ -0,0 +1,17 @@ +#ifndef _MODULE_H_ +#define _MODULE_H_ + +/* This header file exposes a set of functions defined in module.c that are + * not part of the module API, but are used by the core to interact with modules + */ + +typedef struct ValkeyModuleCtx ValkeyModuleCtx; +typedef struct ValkeyModule ValkeyModule; + +ValkeyModuleCtx *moduleAllocateContext(void); +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client); +void moduleFreeContext(ValkeyModuleCtx *ctx); + +#endif /* _MODULE_H_ */ diff --git a/src/script.h b/src/script.h index 7fff34a40b..194cc8bd05 100644 --- a/src/script.h +++ b/src/script.h @@ -67,6 +67,8 @@ #define SCRIPT_ALLOW_CROSS_SLOT (1ULL << 8) /* Indicate that the current script may access keys from multiple slots */ typedef struct scriptRunCtx scriptRunCtx; +/* This struct stores the necessary information to manage the execution of + * scripts using EVAL and FCALL. */ struct scriptRunCtx { const char *funcname; client *c; diff --git a/src/script_lua.c b/src/script_lua.c index 5093fa944f..29d352d44b 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1258,15 +1258,15 @@ static void luaLoadLibraries(lua_State *lua) { /* Return sds of the string value located on stack at the given index. * Return NULL if the value is not a string. 
*/ -sds luaGetStringSds(lua_State *lua, int index) { +robj *luaGetStringObject(lua_State *lua, int index) { if (!lua_isstring(lua, index)) { return NULL; } size_t len; const char *str = lua_tolstring(lua, index, &len); - sds str_sds = sdsnewlen(str, len); - return str_sds; + robj *str_obj = createStringObject(str, len); + return str_obj; } static int luaProtectedTableError(lua_State *lua) { diff --git a/src/script_lua.h b/src/script_lua.h index 35edf46af6..6c60754bbc 100644 --- a/src/script_lua.h +++ b/src/script_lua.h @@ -67,7 +67,7 @@ typedef struct errorInfo { } errorInfo; void luaRegisterServerAPI(lua_State *lua); -sds luaGetStringSds(lua_State *lua, int index); +robj *luaGetStringObject(lua_State *lua, int index); void luaRegisterGlobalProtectionFunction(lua_State *lua); void luaSetErrorMetatable(lua_State *lua); void luaSetAllowListProtection(lua_State *lua); diff --git a/src/util.c b/src/util.c index 6d99d47e5a..6e44392ce1 100644 --- a/src/util.c +++ b/src/util.c @@ -50,6 +50,7 @@ #include "util.h" #include "sha256.h" #include "config.h" +#include "zmalloc.h" #include "valkey_strtod.h" @@ -1380,3 +1381,23 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...) { va_end(args); return result; } + +/* A printf-like function that returns a freshly allocated string. + * + * This function is similar to asprintf function, but it uses zmalloc for + * allocating the string buffer. */ +char *valkey_asprintf(char const *fmt, ...) { + va_list args; + + va_start(args, fmt); + size_t str_len = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + char *str = zmalloc(str_len); + + va_start(args, fmt); + vsnprintf(str, str_len, fmt, args); + va_end(args); + + return str; +} diff --git a/src/util.h b/src/util.h index 51eb38f0b4..61095ddb65 100644 --- a/src/util.h +++ b/src/util.h @@ -99,5 +99,6 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...); #endif size_t valkey_strlcpy(char *dst, const char *src, size_t dsize); size_t valkey_strlcat(char *dst, const char *src, size_t dsize); +char *valkey_asprintf(char const *fmt, ...); #endif diff --git a/src/valkeymodule.h b/src/valkeymodule.h index 7c3adfd477..1d99d2ff7a 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -783,6 +783,7 @@ typedef enum { } ValkeyModuleACLLogEntryReason; /* Incomplete structures needed by both the core and modules. */ +typedef struct ValkeyModuleCtx ValkeyModuleCtx; typedef struct ValkeyModuleIO ValkeyModuleIO; typedef struct ValkeyModuleDigest ValkeyModuleDigest; typedef struct ValkeyModuleInfoCtx ValkeyModuleInfoCtx; @@ -794,6 +795,93 @@ typedef void (*ValkeyModuleInfoFunc)(ValkeyModuleInfoCtx *ctx, int for_crash_rep typedef void (*ValkeyModuleDefragFunc)(ValkeyModuleDefragCtx *ctx); typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata); +/* Current ABI version for scripting engine modules. */ +#define VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION 1UL + +/* Type definitions for implementing scripting engines modules. */ +typedef void ValkeyModuleScriptingEngineCtx; +typedef void ValkeyModuleScriptingEngineFunctionCtx; + +/* This struct represents a scripting engine function that results from the + * compilation of a script by the engine implementation. + * + * IMPORTANT: If we ever need to add/remove fields from this struct, we need + * to bump the version number defined in the + * `VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION` constant. 
+ */
+typedef struct ValkeyModuleScriptingEngineCompiledFunction {
+    ValkeyModuleString *name; /* Function name */
+    void *function;           /* Opaque object representing a function, usually it's
+                                 the function compiled code. */
+    ValkeyModuleString *desc; /* Function description */
+    uint64_t f_flags;         /* Function flags */
+} ValkeyModuleScriptingEngineCompiledFunction;
+
+/* This struct is used to return the memory information of the scripting
+ * engine. */
+typedef struct ValkeyModuleScriptingEngineMemoryInfo {
+    /* The memory used by the scripting engine runtime. */
+    size_t used_memory;
+    /* The memory used by the scripting engine data structures. */
+    size_t engine_memory_overhead;
+} ValkeyModuleScriptingEngineMemoryInfo;
+
+typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    const char *code,
+    size_t timeout,
+    size_t *out_num_compiled_functions,
+    char **err);
+
+typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
+    void *compiled_function,
+    ValkeyModuleString **keys,
+    size_t nkeys,
+    ValkeyModuleString **args,
+    size_t nargs);
+
+typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
+    ValkeyModuleCtx *module_ctx,
+    void *compiled_function);
+
+typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    void *compiled_function);
+
+typedef ValkeyModuleScriptingEngineMemoryInfo (*ValkeyModuleScriptingEngineGetMemoryInfoFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx);
+
+typedef struct ValkeyModuleScriptingEngineMethodsV1 {
+    uint64_t version; /* Version of this structure for ABI compat. */
+
+    /* Library create function callback. When a new script is loaded, this
+     * callback will be called with the script code, and returns a list of
+     * ValkeyModuleScriptingEngineCompiledFunc objects. */
+    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;
+
+    /* Function callback to free the memory of a registered engine function. */
+    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
+
+    /* The callback function called when `FCALL` command is called on a function
+     * registered in this engine. */
+    ValkeyModuleScriptingEngineCallFunctionFunc call_function;
+
+    /* Function callback to return memory overhead for a given function. */
+    ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead;
+
+    /* Function callback to get the used memory by the engine. */
+    ValkeyModuleScriptingEngineGetMemoryInfoFunc get_memory_info;
+
+} ValkeyModuleScriptingEngineMethodsV1;
+
+#define ValkeyModuleScriptingEngineMethods ValkeyModuleScriptingEngineMethodsV1
+
/* ------------------------- End of common defines ------------------------ */

/* ----------- The rest of the defines are only for modules ----------------- */
@@ -826,7 +914,6 @@ typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata);
#endif

/* Incomplete structures for compiler checks but opaque access.
 */
-typedef struct ValkeyModuleCtx ValkeyModuleCtx;
typedef struct ValkeyModuleCommand ValkeyModuleCommand;
typedef struct ValkeyModuleCallReply ValkeyModuleCallReply;
typedef struct ValkeyModuleType ValkeyModuleType;
@@ -1650,6 +1737,14 @@ VALKEYMODULE_API int (*ValkeyModule_RdbSave)(ValkeyModuleCtx *ctx,
                                             ValkeyModuleRdbStream *stream,
                                             int flags) VALKEYMODULE_ATTR;

+VALKEYMODULE_API int (*ValkeyModule_RegisterScriptingEngine)(ValkeyModuleCtx *module_ctx,
+                                                             const char *engine_name,
+                                                             ValkeyModuleScriptingEngineCtx *engine_ctx,
+                                                             ValkeyModuleScriptingEngineMethods *engine_methods) VALKEYMODULE_ATTR;
+
+VALKEYMODULE_API int (*ValkeyModule_UnregisterScriptingEngine)(ValkeyModuleCtx *module_ctx,
+                                                               const char *engine_name) VALKEYMODULE_ATTR;
+
#define ValkeyModule_IsAOFClient(id) ((id) == UINT64_MAX)

/* This is included inline inside each Valkey module. */
@@ -2017,6 +2112,8 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in
    VALKEYMODULE_GET_API(RdbStreamFree);
    VALKEYMODULE_GET_API(RdbLoad);
    VALKEYMODULE_GET_API(RdbSave);
+    VALKEYMODULE_GET_API(RegisterScriptingEngine);
+    VALKEYMODULE_GET_API(UnregisterScriptingEngine);

    if (ValkeyModule_IsModuleNameBusy && ValkeyModule_IsModuleNameBusy(name)) return VALKEYMODULE_ERR;
    ValkeyModule_SetModuleAttribs(ctx, name, ver, apiver);
diff --git a/tests/modules/CMakeLists.txt b/tests/modules/CMakeLists.txt
index 0cac0c4cb6..e98a878c9d 100644
--- a/tests/modules/CMakeLists.txt
+++ b/tests/modules/CMakeLists.txt
@@ -40,6 +40,7 @@ list(APPEND MODULES_LIST "moduleauthtwo")
list(APPEND MODULES_LIST "rdbloadsave")
list(APPEND MODULES_LIST "crash")
list(APPEND MODULES_LIST "cluster")
+list(APPEND MODULES_LIST "helloscripting")

foreach (MODULE_NAME ${MODULES_LIST})
    message(STATUS "Building test module: ${MODULE_NAME}")
diff --git a/tests/modules/Makefile b/tests/modules/Makefile
index 82813bb6f7..963546a9ff 100644
--- a/tests/modules/Makefile
+++ b/tests/modules/Makefile
@@ -65,7 +65,8 @@ TEST_MODULES = \
    moduleauthtwo.so \
    rdbloadsave.so \
    crash.so \
-    cluster.so
+    cluster.so \
+    helloscripting.so

.PHONY: all

diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c
new file mode 100644
index 0000000000..fdca6c8e91
--- /dev/null
+++ b/tests/modules/helloscripting.c
@@ -0,0 +1,383 @@
+#include "valkeymodule.h"
+
+#include
+#include
+#include
+
+/*
+ * This module implements a very simple stack based scripting language.
+ * Its purpose is only to test the valkey module API to implement scripting
+ * engines.
+ *
+ * The language is called HELLO, and a program in this language is formed by
+ * a list of function definitions.
+ * The language only supports 32-bit integers, and it only allows returning an
+ * integer constant, or returning the value passed as the first argument to the
+ * function.
+ *
+ * Example of a program:
+ *
+ * ```
+ * FUNCTION foo # declaration of function 'foo'
+ * ARGS 0       # pushes the value in the first argument to the top of the
+ *              # stack
+ * RETURN       # returns the current value on the top of the stack and marks
+ *              # the end of the function declaration
+ *
+ * FUNCTION bar # declaration of function 'bar'
+ * CONSTI 432   # pushes the value 432 to the top of the stack
+ * RETURN       # returns the current value on the top of the stack and marks
+ *              # the end of the function declaration.
+ * ```
+ */
+
+/*
+ * List of instructions of the HELLO language.
+ */
+typedef enum HelloInstKind {
+    FUNCTION = 0,
+    CONSTI,
+    ARGS,
+    RETURN,
+    _NUM_INSTRUCTIONS, // Not a real instruction.
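+                       // Also used as the "unknown instruction" sentinel
+                       // returned by helloLangParseInstruction().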
+} HelloInstKind;
+
+/*
+ * String representations of the instructions above.
+ */
+const char *HelloInstKindStr[] = {
+ "FUNCTION",
+ "CONSTI",
+ "ARGS",
+ "RETURN",
+};
+
+/*
+ * Struct that represents an instance of an instruction.
+ * Instructions may have at most one parameter.
+ */
+typedef struct HelloInst {
+ HelloInstKind kind;
+ union {
+ uint32_t integer;
+ const char *string;
+ } param;
+} HelloInst;
+
+/*
+ * Struct that represents an instance of a function.
+ * A function is just a list of instruction instances.
+ */
+typedef struct HelloFunc {
+ char *name;
+ HelloInst instructions[256];
+ uint32_t num_instructions;
+} HelloFunc;
+
+/*
+ * Struct that represents an instance of a HELLO program.
+ * A program is just a list of function instances.
+ */
+typedef struct HelloProgram {
+ HelloFunc *functions[16];
+ uint32_t num_functions;
+} HelloProgram;
+
+/*
+ * Struct that represents the runtime context of a HELLO program.
+ */
+typedef struct HelloLangCtx {
+ HelloProgram *program;
+} HelloLangCtx;
+
+
+static HelloLangCtx *hello_ctx = NULL;
+
+
+static uint32_t str2int(const char *str) {
+ char *end;
+ errno = 0;
+ uint32_t val = (uint32_t)strtoul(str, &end, 10);
+ ValkeyModule_Assert(errno == 0);
+ return val;
+}
+
+/*
+ * Parses the kind of instruction that the current token points to.
+ */
+static HelloInstKind helloLangParseInstruction(const char *token) {
+ for (HelloInstKind i = 0; i < _NUM_INSTRUCTIONS; i++) {
+ if (strcmp(HelloInstKindStr[i], token) == 0) {
+ return i;
+ }
+ }
+ return _NUM_INSTRUCTIONS;
+}
+
+/*
+ * Parses the function param.
+ */
+static void helloLangParseFunction(HelloFunc *func) {
+ char *token = strtok(NULL, " \n");
+ ValkeyModule_Assert(token != NULL);
+ func->name = ValkeyModule_Alloc(sizeof(char) * strlen(token) + 1);
+ strcpy(func->name, token);
+}
+
+/*
+ * Parses an integer parameter.
+ */
+static void helloLangParseIntegerParam(HelloFunc *func) {
+ char *token = strtok(NULL, " \n");
+ func->instructions[func->num_instructions].param.integer = str2int(token);
+}
+
+/*
+ * Parses the CONSTI instruction parameter.
+ */
+static void helloLangParseConstI(HelloFunc *func) {
+ helloLangParseIntegerParam(func);
+ func->num_instructions++;
+}
+
+/*
+ * Parses the ARGS instruction parameter.
+ */
+static void helloLangParseArgs(HelloFunc *func) {
+ helloLangParseIntegerParam(func);
+ func->num_instructions++;
+}
+
+/*
+ * Parses a HELLO program's source code.
+ */
+static HelloProgram *helloLangParseCode(const char *code,
+ HelloProgram *program) {
+ char *_code = ValkeyModule_Alloc(sizeof(char) * strlen(code) + 1);
+ strcpy(_code, code);
+
+ HelloFunc *currentFunc = NULL;
+
+ char *token = strtok(_code, " \n");
+ while (token != NULL) {
+ HelloInstKind kind = helloLangParseInstruction(token);
+
+ if (currentFunc != NULL) {
+ currentFunc->instructions[currentFunc->num_instructions].kind = kind;
+ }
+
+ switch (kind) {
+ case FUNCTION:
+ ValkeyModule_Assert(currentFunc == NULL);
+ currentFunc = ValkeyModule_Alloc(sizeof(HelloFunc));
+ memset(currentFunc, 0, sizeof(HelloFunc));
+ program->functions[program->num_functions++] = currentFunc;
+ helloLangParseFunction(currentFunc);
+ break;
+ case CONSTI:
+ ValkeyModule_Assert(currentFunc != NULL);
+ helloLangParseConstI(currentFunc);
+ break;
+ case ARGS:
+ ValkeyModule_Assert(currentFunc != NULL);
+ helloLangParseArgs(currentFunc);
+ break;
+ case RETURN:
+ ValkeyModule_Assert(currentFunc != NULL);
+ currentFunc->num_instructions++;
+ currentFunc = NULL;
+ break;
+ default:
+ ValkeyModule_Assert(0);
+ }
+
+ token = strtok(NULL, " \n");
+ }
+
+ ValkeyModule_Free(_code);
+
+ return program;
+}
+
+/*
+ * Executes a HELLO function.
+ */
+static uint32_t executeHelloLangFunction(HelloFunc *func,
+ ValkeyModuleString **args, int nargs) {
+ uint32_t stack[64];
+ int sp = 0;
+
+ for (uint32_t pc = 0; pc < func->num_instructions; pc++) {
+ HelloInst instr = func->instructions[pc];
+ switch (instr.kind) {
+ case CONSTI:
+ stack[sp++] = instr.param.integer;
+ break;
+ case ARGS:
+ uint32_t idx = instr.param.integer;
+ ValkeyModule_Assert(idx < (uint32_t)nargs);
+ size_t len;
+ const char *argStr = ValkeyModule_StringPtrLen(args[idx], &len);
+ uint32_t arg = str2int(argStr);
+ stack[sp++] = arg;
+ break;
+ case RETURN:
+ uint32_t val = stack[--sp];
+ ValkeyModule_Assert(sp == 0);
+ return val;
+ case FUNCTION:
+ default:
+ ValkeyModule_Assert(0);
+ }
+ }
+
+ ValkeyModule_Assert(0);
+ return 0;
+}
+
+static ValkeyModuleScriptingEngineMemoryInfo engineGetMemoryInfo(ValkeyModuleCtx *module_ctx,
+ ValkeyModuleScriptingEngineCtx *engine_ctx) {
+ VALKEYMODULE_NOT_USED(module_ctx);
+ HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx;
+ ValkeyModuleScriptingEngineMemoryInfo mem_info = {0};
+
+ if (ctx->program != NULL) {
+ mem_info.used_memory += ValkeyModule_MallocSize(ctx->program);
+
+ for (uint32_t i = 0; i < ctx->program->num_functions; i++) {
+ HelloFunc *func = ctx->program->functions[i];
+ mem_info.used_memory += ValkeyModule_MallocSize(func);
+ mem_info.used_memory += ValkeyModule_MallocSize(func->name);
+ }
+ }
+
+ mem_info.engine_memory_overhead = ValkeyModule_MallocSize(ctx);
+ if (ctx->program != NULL) {
+ mem_info.engine_memory_overhead += ValkeyModule_MallocSize(ctx->program);
+ }
+
+ return mem_info;
+}
+
+static size_t engineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx,
+ void *compiled_function) {
+ VALKEYMODULE_NOT_USED(module_ctx);
+ HelloFunc *func = (HelloFunc *)compiled_function;
+ return ValkeyModule_MallocSize(func->name);
+}
+
+static void engineFreeFunction(ValkeyModuleCtx *module_ctx,
+ ValkeyModuleScriptingEngineCtx *engine_ctx,
+ void *compiled_function) {
+ VALKEYMODULE_NOT_USED(module_ctx);
+ VALKEYMODULE_NOT_USED(engine_ctx);
+ HelloFunc *func = (HelloFunc *)compiled_function;
+ ValkeyModule_Free(func->name);
+ func->name = NULL;
+ ValkeyModule_Free(func);
+}
+
+static ValkeyModuleScriptingEngineCompiledFunction **createHelloLangEngine(ValkeyModuleCtx *module_ctx,
+
ValkeyModuleScriptingEngineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + VALKEYMODULE_NOT_USED(module_ctx); + VALKEYMODULE_NOT_USED(timeout); + VALKEYMODULE_NOT_USED(err); + + HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx; + + if (ctx->program == NULL) { + ctx->program = ValkeyModule_Alloc(sizeof(HelloProgram)); + memset(ctx->program, 0, sizeof(HelloProgram)); + } else { + ctx->program->num_functions = 0; + } + + ctx->program = helloLangParseCode(code, ctx->program); + + ValkeyModuleScriptingEngineCompiledFunction **compiled_functions = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction *) * ctx->program->num_functions); + + for (uint32_t i = 0; i < ctx->program->num_functions; i++) { + HelloFunc *func = ctx->program->functions[i]; + + ValkeyModuleScriptingEngineCompiledFunction *cfunc = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction)); + *cfunc = (ValkeyModuleScriptingEngineCompiledFunction) { + .name = ValkeyModule_CreateString(NULL, func->name, strlen(func->name)), + .function = func, + .desc = NULL, + .f_flags = 0, + }; + + compiled_functions[i] = cfunc; + } + + *out_num_compiled_functions = ctx->program->num_functions; + + return compiled_functions; +} + +static void +callHelloLangFunction(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineFunctionCtx *func_ctx, + void *compiled_function, + ValkeyModuleString **keys, size_t nkeys, + ValkeyModuleString **args, size_t nargs) { + VALKEYMODULE_NOT_USED(engine_ctx); + VALKEYMODULE_NOT_USED(func_ctx); + VALKEYMODULE_NOT_USED(keys); + VALKEYMODULE_NOT_USED(nkeys); + + HelloFunc *func = (HelloFunc *)compiled_function; + uint32_t result = executeHelloLangFunction(func, args, nargs); + + ValkeyModule_ReplyWithLongLong(module_ctx, result); +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, + int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "helloengine", 1, VALKEYMODULE_APIVER_1) == + VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + hello_ctx = ValkeyModule_Alloc(sizeof(HelloLangCtx)); + hello_ctx->program = NULL; + + ValkeyModuleScriptingEngineMethods methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = createHelloLangEngine, + .call_function = callHelloLangFunction, + .get_function_memory_overhead = engineFunctionMemoryOverhead, + .free_function = engineFreeFunction, + .get_memory_info = engineGetMemoryInfo, + }; + + ValkeyModule_RegisterScriptingEngine(ctx, + "HELLO", + hello_ctx, + &methods); + + return VALKEYMODULE_OK; +} + +int ValkeyModule_OnUnload(ValkeyModuleCtx *ctx) { + if (ValkeyModule_UnregisterScriptingEngine(ctx, "HELLO") != VALKEYMODULE_OK) { + ValkeyModule_Log(ctx, "error", "Failed to unregister engine"); + return VALKEYMODULE_ERR; + } + + ValkeyModule_Free(hello_ctx->program); + hello_ctx->program = NULL; + ValkeyModule_Free(hello_ctx); + hello_ctx = NULL; + + return VALKEYMODULE_OK; +} diff --git a/tests/unit/functions.tcl b/tests/unit/functions.tcl index 7ddd36dd7d..1636baaf6d 100644 --- a/tests/unit/functions.tcl +++ b/tests/unit/functions.tcl @@ -604,7 +604,7 @@ start_server {tags {"scripting"}} { } } e set _ $e - } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*} + } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character 
long*} test {LIBRARIES - test registration with empty name} { catch { @@ -613,7 +613,7 @@ start_server {tags {"scripting"}} { } } e set _ $e - } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*} + } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character long*} test {LIBRARIES - math.random from function load} { catch { diff --git a/tests/unit/moduleapi/scriptingengine.tcl b/tests/unit/moduleapi/scriptingengine.tcl new file mode 100644 index 0000000000..c350633dd8 --- /dev/null +++ b/tests/unit/moduleapi/scriptingengine.tcl @@ -0,0 +1,126 @@ +set testmodule [file normalize tests/modules/helloscripting.so] + +set HELLO_PROGRAM "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 432\nRETURN" + +start_server {tags {"modules"}} { + r module load $testmodule + + r function load $HELLO_PROGRAM + + test {Load script with invalid library name} { + assert_error {ERR Library names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=my-lib\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with existing library} { + assert_error {ERR Library 'mylib' already exists} {r function load $HELLO_PROGRAM} + } + + test {Load script with invalid engine} { + assert_error {ERR Engine 'wasm' not found} {r function load "#!wasm name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no functions} { + assert_error {ERR No functions registered} {r function load "#!hello name=mylib2\n"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function foo already exists} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no metadata header} { + assert_error {ERR Missing library metadata} {r function load "FUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with header without lib name} { + assert_error {ERR Library name was not given} {r function load "#!hello \n"} + } + + test {Load script with header with unknown param} { + assert_error {ERR Invalid metadata value given: nme=mylib} {r function load "#!hello nme=mylib\n"} + } + + test {Load script with header with lib name passed twice} { + assert_error {ERR Invalid metadata value, name argument was given multiple times} {r function load "#!hello name=mylib2 name=mylib3\n"} + } + + test {Load script with invalid function name} { + assert_error {ERR Function names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=mylib2\nFUNCTION foo-bar\nARGS 0\nRETURN"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function already exists in the library} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Call scripting engine function: calling foo works} { + r fcall foo 0 134 + } {134} + + test {Call scripting engine function: calling bar works} { + r fcall bar 0 + } {432} + + test {Replace function library and call functions} { + set result [r function load replace "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 500\nRETURN"] + assert_equal $result "mylib" + + set result [r fcall foo 0 132] + assert_equal $result 132 + + set result [r fcall bar 0] + assert_equal $result 500 + } + + test {List scripting engine functions} { + r function load replace "#!hello name=mylib\nFUNCTION foobar\nARGS 0\nRETURN" 
+ r function list
+ } {{library_name mylib engine HELLO functions {{name foobar description {} flags {}}}}}
+
+ test {Load a second library and call a function} {
+ r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN"
+ set result [r fcall getarg 0 456]
+ assert_equal $result 456
+ }
+
+ test {Delete all libraries and functions} {
+ set result [r function flush]
+ assert_equal $result {OK}
+ r function list
+ } {}
+
+ test {Test the deletion of a single library} {
+ r function load $HELLO_PROGRAM
+ r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN"
+
+ set result [r function delete mylib]
+ assert_equal $result {OK}
+
+ set result [r fcall getarg 0 446]
+ assert_equal $result 446
+ }
+
+ test {Test dump and restore function library} {
+ r function load $HELLO_PROGRAM
+
+ set result [r fcall bar 0]
+ assert_equal $result 432
+
+ set dump [r function dump]
+
+ set result [r function flush]
+ assert_equal $result {OK}
+
+ set result [r function restore $dump]
+ assert_equal $result {OK}
+
+ set result [r fcall getarg 0 436]
+ assert_equal $result 436
+
+ set result [r fcall bar 0]
+ assert_equal $result 432
+ }
+
+ test {Unload scripting engine module} {
+ set result [r module unload helloengine]
+ assert_equal $result "OK"
+ }
+}

From 65d054b3caf96fd5c041f74dd0e004a2c71cd1fa Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 23 Dec 2024 05:57:56 +0800
Subject: [PATCH 033/101] Fix switch case compilation error in the new
 helloscripting (#1472)

The curly braces were missing for the variable declarations after the
case labels.

Signed-off-by: Binbin
---
 tests/modules/helloscripting.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c
index fdca6c8e91..c912164bda 100644
--- a/tests/modules/helloscripting.c
+++ b/tests/modules/helloscripting.c
@@ -213,7 +213,7 @@ static uint32_t executeHelloLangFunction(HelloFunc *func,
 case CONSTI:
 stack[sp++] = instr.param.integer;
 break;
- case ARGS:
+ case ARGS: {
 uint32_t idx = instr.param.integer;
 ValkeyModule_Assert(idx < (uint32_t)nargs);
 size_t len;
@@ -221,10 +221,12 @@ static uint32_t executeHelloLangFunction(HelloFunc *func,
 uint32_t arg = str2int(argStr);
 stack[sp++] = arg;
 break;
- case RETURN:
+ }
+ case RETURN: {
 uint32_t val = stack[--sp];
 ValkeyModule_Assert(sp == 0);
 return val;
+ }
 case FUNCTION:
 default:
 ValkeyModule_Assert(0);

From 070ad88b158560625ccf90204928f54f3607fac2 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Mon, 23 Dec 2024 21:07:15 -0800
Subject: [PATCH 034/101] Remove readability refactor for failover auth to fix
 clang warning (#1481)

As part of #1463, I made a small refactor between the PR and the daily
test I submitted to try to improve readability by adding a function to
abstract the extraction of the message types. However, that change
apparently caused GCC to throw another warning, so this reverts the
abstraction on just one line.

Signed-off-by: Madelyn Olson
---
 src/cluster_legacy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9a23527b30..3d838dfe06 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4361,7 +4361,7 @@ void clusterRequestFailoverAuth(void) {
 /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit
 * in the header to communicate the nodes receiving the message that
 * they should authorized the failover even if the primary is working.
 */
- if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
+ if (server.cluster->mf_end) msgblock->data[0].msg.mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
 clusterBroadcastMessage(msgblock);
 clusterMsgSendBlockDecrRefCount(msgblock);
 }

From c8e5fc94f7ffdfb522c0904725968494abc109a9 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 24 Dec 2024 08:13:25 +0200
Subject: [PATCH 035/101] Reduce dual channel testing time (#1477)

- By not waiting `repl-diskless-sync-delay` when we don't have to, we
can reduce dual channel test execution time by ~30%.
- This commit also drops one test which is not required for regular
sync (`Sync should continue if not all slaves dropped`).
- Skip dual channel test with master diskless disabled because it will
initiate the same synchronization process as the non-dual channel test,
making it redundant.

Before:
```
Execution time of different units:
 171 seconds - integration/dual-channel-replication
 305 seconds - integration/replication-psync

\o/ All tests passed without errors!
```
After:
```
Execution time of different units:
 120 seconds - integration/dual-channel-replication
 236 seconds - integration/replication-psync

\o/ All tests passed without errors!
```
Discussed on https://github.com/valkey-io/valkey/pull/1173

---------

Signed-off-by: naglera
---
 .../integration/dual-channel-replication.tcl | 60 ++++++++++---------
 tests/integration/replication-psync.tcl | 4 ++
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index 8191b9f699..b4b9286d68 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -110,6 +110,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 $primary config set rdb-key-save-delay 200
 $primary config set dual-channel-replication-enabled yes
+ $primary config set repl-diskless-sync-delay 0

 $replica config set dual-channel-replication-enabled yes
 $replica config set repl-diskless-sync no
@@ -201,6 +202,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 # a replication buffer block.
$primary config set client-output-buffer-limit "replica 1100k 0 0" $primary config set dual-channel-replication-enabled $enable + $primary config set repl-diskless-sync-delay 0 $replica config set dual-channel-replication-enabled $enable test "Toggle dual-channel-replication-enabled: $enable start" { @@ -506,6 +508,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set dual-channel-replication-enabled yes $primary config set repl-backlog-size $backlog_size $primary config set loglevel debug + $primary config set repl-diskless-sync-delay 0 if {$::valgrind} { $primary config set repl-timeout 100 } else { @@ -877,7 +880,6 @@ start_server {tags {"dual-channel-replication external:skip"}} { } } -foreach dualchannel {yes no} { start_server {tags {"dual-channel-replication external:skip"}} { set primary [srv 0 client] set primary_host [srv 0 host] @@ -893,20 +895,20 @@ start_server {tags {"dual-channel-replication external:skip"}} { # Generating RDB will cost 5s(10000 * 0.0005s) $primary debug populate 10000 primary 1 $primary config set rdb-key-save-delay 500 - $primary config set dual-channel-replication-enabled $dualchannel + $primary config set dual-channel-replication-enabled yes start_server {} { set replica1 [srv 0 client] - $replica1 config set dual-channel-replication-enabled $dualchannel + $replica1 config set dual-channel-replication-enabled yes $replica1 config set loglevel debug start_server {} { set replica2 [srv 0 client] - $replica2 config set dual-channel-replication-enabled $dualchannel + $replica2 config set dual-channel-replication-enabled yes $replica2 config set loglevel debug $replica2 config set repl-timeout 60 set load_handle [start_one_key_write_load $primary_host $primary_port 100 "mykey1"] - test "Sync should continue if not all slaves dropped dual-channel-replication $dualchannel" { + test "Sync should continue if not all slaves dropped" { $replica1 replicaof $primary_host $primary_port $replica2 replicaof $primary_host $primary_port @@ -915,20 +917,17 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Sync did not start" } - if {$dualchannel == "yes"} { - # Wait for both replicas main conns to establish psync - wait_for_condition 50 1000 { - [status $primary sync_partial_ok] == 2 - } else { - fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" - } + # Wait for both replicas main conns to establish psync + wait_for_condition 50 1000 { + [status $primary sync_partial_ok] == 2 + } else { + fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" } - catch {$replica1 shutdown nosave} wait_for_condition 50 2000 { [status $replica2 master_link_status] == "up" && [status $primary sync_full] == 2 && - (($dualchannel == "yes" && [status $primary sync_partial_ok] == 2) || $dualchannel == "no") + ([status $primary sync_partial_ok] == 2) } else { fail "Sync session interapted\n sync_full:[status $primary sync_full]\n @@ -942,7 +941,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary debug populate 1000000 primary 1 $primary config set rdb-key-save-delay 100 - test "Primary abort sync if all slaves dropped dual-channel-replication $dualchannel" { + test "Primary abort sync if all slaves dropped dual-channel-replication" { set cur_psync [status $primary sync_partial_ok] $replica2 replicaof $primary_host $primary_port @@ -951,13 +950,11 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Sync did not 
start" } - if {$dualchannel == "yes"} { - # Wait for both replicas main conns to establish psync - wait_for_condition 50 1000 { - [status $primary sync_partial_ok] == $cur_psync + 1 - } else { - fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" - } + # Wait for both replicas main conns to establish psync + wait_for_condition 50 1000 { + [status $primary sync_partial_ok] == $cur_psync + 1 + } else { + fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" } catch {$replica2 shutdown nosave} @@ -971,7 +968,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } } } -} + start_server {tags {"dual-channel-replication external:skip"}} { set primary [srv 0 client] @@ -982,8 +979,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes $primary config set loglevel debug - $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry - + $primary config set repl-diskless-sync-delay 0 # Generating RDB will cost 500s(1000000 * 0.0001s) $primary debug populate 1000000 primary 1 $primary config set rdb-key-save-delay 100 @@ -1014,6 +1010,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] assert {$replica_main_conn_id != ""} set loglines [count_log_lines -1] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_main_conn_id # Wait for primary to abort the sync wait_for_condition 50 1000 { @@ -1034,6 +1031,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } test "Test dual-channel-replication replica rdb connection disconnected" { + $primary config set repl-diskless-sync-delay 0 $replica replicaof $primary_host $primary_port # Wait for sync session to start wait_for_condition 500 1000 { @@ -1048,6 +1046,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary debug log "killing replica rdb connection $replica_rdb_channel_id" assert {$replica_rdb_channel_id != ""} set loglines [count_log_lines -1] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_rdb_channel_id # Wait for primary to abort the sync wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10 @@ -1063,6 +1062,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } test "Test dual-channel-replication primary reject set-rdb-client after client killed" { + $primary config set repl-diskless-sync-delay 0 # Ensure replica main channel will not handshake before rdb client is killed $replica debug pause-after-fork 1 $replica replicaof $primary_host $primary_port @@ -1077,6 +1077,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"] assert {$replica_rdb_channel_id != ""} $primary debug log "killing replica rdb connection $replica_rdb_channel_id" + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_rdb_channel_id # Wait for primary to abort the sync wait_and_resume_process 0 @@ -1154,7 +1155,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes $primary config set 
loglevel debug
- $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
+ $primary config set repl-diskless-sync-delay 0

 # Generating RDB will cost 100 sec to generate
 $primary debug populate 10000 primary 1
@@ -1185,6 +1186,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"]
 assert {$replica_rdb_channel_id != ""}
 set loglines [count_log_lines -1]
+ $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
 $primary client kill id $replica_rdb_channel_id
 # Wait for primary to abort the sync
@@ -1192,6 +1194,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 } else {
 fail "Primary did not free repl buf block after sync failure"
 }
+ $primary config set repl-diskless-sync-delay 0
 wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10
 # Replica should retry
 wait_for_condition 500 1000 {
@@ -1200,7 +1203,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 [s -1 rdb_bgsave_in_progress] eq 1
 } else {
 fail "replica didn't retry after connection close"
- }
+ }
 }
 $replica replicaof no one
 wait_for_condition 500 1000 {
@@ -1218,11 +1221,11 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 } else {
 fail "replica didn't start sync session in time"
 }
-
 $primary debug log "killing replica main connection"
 set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"]
 assert {$replica_main_conn_id != ""}
 set loglines [count_log_lines -1]
+ $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
 $primary client kill id $replica_main_conn_id
 # Wait for primary to abort the sync
@@ -1230,6 +1233,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 } else {
 fail "Primary did not free repl buf block after sync failure"
 }
+ $primary config set repl-diskless-sync-delay 0
 wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10
 # Replica should retry
 wait_for_condition 500 1000 {
diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl
index 4c305ebff4..88a33045f0 100644
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@@ -115,6 +115,10 @@ tags {"external:skip"} {
 foreach mdl {no yes} {
 foreach sdl {disabled swapdb} {
 foreach dualchannel {yes no} {
+ # Skip dual channel test with master diskless disabled
+ if {$dualchannel == "yes" && $mdl == "no"} {
+ continue
+ }
 test_psync {no reconnection, just sync} 6 1000000 3600 0 {
 } $mdl $sdl $dualchannel 0

From da15cee7591e8fb678dbc5c8cbf3e28e2a2e2380 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 24 Dec 2024 08:14:32 +0200
Subject: [PATCH 036/101] Add scoped RDB loading context and immediate abort
 flag (#1173)

This PR introduces a new mechanism for temporarily changing the
server's loading_rio context during RDB loading operations. The new
`rdbLoadRioWithLoadingCtxScopedRdb` wrapper allows for a scoped change
of the `server.loading_rio` value, ensuring that it's automatically
restored to its original value when the scope ends.

Introduces a dedicated flag to `rio` to signal immediate abort,
preventing potential use-after-free scenarios during replication
disconnection in dual-channel load.
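As a rough illustration of both mechanisms (a minimal standalone sketch
under assumed names -- `loading_rio` stands in for `server.loading_rio`,
and `rio_close_asap`, `load_payload`, and `load_with_scoped_rio` are
hypothetical stand-ins, not the server code itself):

```
#include <stdio.h>

/* Simplified stand-in for the real rio type; illustration only. */
typedef struct rio { int close_asap; } rio;

static rio *loading_rio = NULL; /* stands in for server.loading_rio */

/* Analogous to rioCloseASAP(): flag the stream so in-flight reads stop. */
static void rio_close_asap(rio *r) { r->close_asap = 1; }

/* Stand-in for the load loop: checks the abort flag before each step. */
static int load_payload(rio *r) {
    return r->close_asap ? -1 : 0;
}

/* Scoped wrapper mirroring rdbLoadRioWithLoadingCtxScopedRdb():
 * publish the rio for the duration of the load, then restore the
 * previous one. Restoring the previous pointer, rather than resetting
 * to NULL, keeps the value correct even if loads ever nest. */
static int load_with_scoped_rio(rio *r) {
    rio *prev = loading_rio;
    loading_rio = r;
    int retval = load_payload(r);
    loading_rio = prev;
    return retval;
}

int main(void) {
    rio r = {0};
    printf("clean load: %d\n", load_with_scoped_rio(&r)); /* 0 */
    rio_close_asap(&r);
    printf("aborted: %d\n", load_with_scoped_rio(&r)); /* -1 */
    return 0;
}
```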
This ensures proper termination of `rdbLoadRioWithLoadingCtx` when replication is cancelled due to connection loss on main connection. Fixes https://github.com/valkey-io/valkey/issues/1152 --------- Signed-off-by: naglera Signed-off-by: Madelyn Olson Signed-off-by: Amit Nagler <58042354+naglera@users.noreply.github.com> Co-authored-by: Madelyn Olson Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> --- src/rdb.c | 15 ++++- src/rdb.h | 2 +- src/replication.c | 15 ++--- src/rio.h | 16 ++++- src/server.c | 1 + src/server.h | 1 + .../integration/dual-channel-replication.tcl | 62 ++++++++++++++++++- 7 files changed, 95 insertions(+), 17 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 5fb77a2897..a4eb2823fb 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -64,6 +64,7 @@ char *rdbFileBeingLoaded = NULL; /* used for rdb checking on read error */ extern int rdbCheckMode; void rdbCheckError(const char *fmt, ...); void rdbCheckSetError(const char *fmt, ...); +int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx); #ifdef __GNUC__ void rdbReportError(int corruption_error, int linenum, char *reason, ...) __attribute__((format(printf, 3, 4))); @@ -2991,7 +2992,19 @@ int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx *lib_ctx, int rdbflags, s int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { functionsLibCtx *functions_lib_ctx = functionsLibCtxGetCurrent(); rdbLoadingCtx loading_ctx = {.dbarray = server.db, .functions_lib_ctx = functions_lib_ctx}; - int retval = rdbLoadRioWithLoadingCtx(rdb, rdbflags, rsi, &loading_ctx); + int retval = rdbLoadRioWithLoadingCtxScopedRdb(rdb, rdbflags, rsi, &loading_ctx); + return retval; +} + +/* Wrapper for rdbLoadRioWithLoadingCtx that manages a scoped RDB context. + * This method wraps the rdbLoadRioWithLoadingCtx function, providing temporary + * RDB context management. It sets a new current loading RDB, calls the wrapped + * function, and then restores the previous loading RDB context. 
*/ +int rdbLoadRioWithLoadingCtxScopedRdb(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx) { + rio *prev_rio = server.loading_rio; + server.loading_rio = rdb; + int retval = rdbLoadRioWithLoadingCtx(rdb, rdbflags, rsi, rdb_loading_ctx); + server.loading_rio = prev_rio; return retval; } diff --git a/src/rdb.h b/src/rdb.h index e9d53fa398..7342a926b5 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -172,7 +172,7 @@ int rdbLoadBinaryDoubleValue(rio *rdb, double *val); int rdbSaveBinaryFloatValue(rio *rdb, float val); int rdbLoadBinaryFloatValue(rio *rdb, float *val); int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi); -int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx); +int rdbLoadRioWithLoadingCtxScopedRdb(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx); int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx *lib_ctx, int rdbflags, sds *err); int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi); ssize_t rdbSaveFunctions(rio *rdb); diff --git a/src/replication.c b/src/replication.c index 3a207a1d0f..f907771e71 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2254,7 +2254,7 @@ void readSyncBulkPayload(connection *conn) { int loadingFailed = 0; rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; - if (rdbLoadRioWithLoadingCtx(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { + if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { /* RDB loading failed. */ serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " "from socket, check server logs."); @@ -2831,18 +2831,15 @@ typedef struct replDataBufBlock { * Reads replication data from primary into specified repl buffer block */ int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { int nread = connRead(conn, data_block->buf + data_block->used, read); - if (nread == -1) { - if (connGetState(conn) != CONN_STATE_CONNECTED) { - dualChannelServerLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn)); + if (nread <= 0) { + if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { + dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); + /* Signal ongoing RDB load to terminate gracefully */ + if (server.loading_rio) rioCloseASAP(server.loading_rio); cancelReplicationHandshake(1); } return C_ERR; } - if (nread == 0) { - dualChannelServerLog(LL_VERBOSE, "Provisional primary closed connection"); - cancelReplicationHandshake(1); - return C_ERR; - } data_block->used += nread; server.stat_total_reads_processed++; return read - nread; diff --git a/src/rio.h b/src/rio.h index ee0f27aa7e..d5c3263e79 100644 --- a/src/rio.h +++ b/src/rio.h @@ -39,6 +39,7 @@ #define RIO_FLAG_READ_ERROR (1 << 0) #define RIO_FLAG_WRITE_ERROR (1 << 1) +#define RIO_FLAG_CLOSE_ASAP (1 << 2) /* Rio was closed asynchronously during the current rio operation. */ #define RIO_TYPE_FILE (1 << 0) #define RIO_TYPE_BUFFER (1 << 1) @@ -115,7 +116,7 @@ typedef struct _rio rio; * if needed. */ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { - if (r->flags & RIO_FLAG_WRITE_ERROR) return 0; + if (r->flags & RIO_FLAG_WRITE_ERROR || r->flags & RIO_FLAG_CLOSE_ASAP) return 0; while (len) { size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? 
r->max_processing_chunk : len; @@ -132,7 +133,7 @@ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { } static inline size_t rioRead(rio *r, void *buf, size_t len) { - if (r->flags & RIO_FLAG_READ_ERROR) return 0; + if (r->flags & RIO_FLAG_READ_ERROR || r->flags & RIO_FLAG_CLOSE_ASAP) return 0; while (len) { size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len; @@ -156,6 +157,10 @@ static inline int rioFlush(rio *r) { return r->flush(r); } +static inline void rioCloseASAP(rio *r) { + r->flags |= RIO_FLAG_CLOSE_ASAP; +} + /* This function allows to know if there was a read error in any past * operation, since the rio stream was created or since the last call * to rioClearError(). */ @@ -168,8 +173,13 @@ static inline int rioGetWriteError(rio *r) { return (r->flags & RIO_FLAG_WRITE_ERROR) != 0; } +/* Like rioGetReadError() but for async close errors. */ +static inline int rioGetAsyncCloseError(rio *r) { + return (r->flags & RIO_FLAG_CLOSE_ASAP) != 0; +} + static inline void rioClearErrors(rio *r) { - r->flags &= ~(RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR); + r->flags &= ~(RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR | RIO_FLAG_CLOSE_ASAP); } void rioInitWithFile(rio *r, FILE *fp); diff --git a/src/server.c b/src/server.c index a0c642b541..b997a9aec6 100644 --- a/src/server.c +++ b/src/server.c @@ -2218,6 +2218,7 @@ void initServerConfig(void) { server.fsynced_reploff_pending = 0; server.rdb_client_id = -1; server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT; + server.loading_rio = NULL; /* Replication partial resync backlog */ server.repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index d8497ccff5..61fa2c3c5a 100644 --- a/src/server.h +++ b/src/server.h @@ -2089,6 +2089,7 @@ struct valkeyServer { int dbid; } repl_provisional_primary; client *cached_primary; /* Cached primary to be reused for PSYNC. */ + rio *loading_rio; /* Pointer to the rio object currently used for loading data. 
*/ int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ int repl_state; /* Replication status if the instance is a replica */ int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */ diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index b4b9286d68..3adf9ce9fd 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -1158,8 +1158,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync-delay 0 # Generating RDB will cost 100 sec to generate - $primary debug populate 10000 primary 1 - $primary config set rdb-key-save-delay 10000 + $primary debug populate 100000 primary 1 + $primary config set rdb-key-save-delay 1000 start_server {} { set replica [srv 0 client] @@ -1222,7 +1222,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { fail "replica didn't start sync session in time" } $primary debug log "killing replica main connection" - set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"] + set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] assert {$replica_main_conn_id != ""} set loglines [count_log_lines -1] $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry @@ -1247,3 +1247,59 @@ start_server {tags {"dual-channel-replication external:skip"}} { stop_write_load $load_handle } } + + +start_server {tags {"dual-channel-replication external:skip"}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + $primary config set repl-diskless-sync yes + $primary config set dual-channel-replication-enabled yes + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry + + # Generating RDB will take 100 sec to generate + $primary debug populate 1000000 primary 1 + $primary config set rdb-key-save-delay -10 + + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + set replica_log [srv 0 stdout] + + $replica config set dual-channel-replication-enabled yes + $replica config set loglevel debug + $replica config set repl-timeout 10 + $replica config set repl-diskless-load flush-before-load + + test "Replica notice main-connection killed during rdb load callback" {; # https://github.com/valkey-io/valkey/issues/1152 + set loglines [count_log_lines 0] + $replica replicaof $primary_host $primary_port + # Wait for sync session to start + wait_for_condition 500 1000 { + [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] && + [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] && + [s -1 rdb_bgsave_in_progress] eq 1 + } else { + fail "replica didn't start sync session in time" + } + wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10 + $primary set key val + set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] + $primary debug log "killing replica main connection $replica_main_conn_id" + assert {$replica_main_conn_id != ""} + set loglines [count_log_lines 0] + $primary config set rdb-key-save-delay 0; # disable delay to allow next sync to succeed + $primary client kill id $replica_main_conn_id + # Wait for primary to abort the sync + wait_for_condition 50 1000 { + [string match {*replicas_waiting_psync:0*} [$primary info replication]] + } else { + fail 
"Primary did not free repl buf block after sync failure" + } + wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines 1000 10 + verify_replica_online $primary 0 500 + } + } +} From ff394270af692806c30b70c514f8a7a3851ccdc6 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 25 Dec 2024 10:57:42 +0800 Subject: [PATCH 037/101] Document all command flags near serverCommand (#1474) These flags are not documented here. Signed-off-by: Binbin --- src/server.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/server.h b/src/server.h index 61fa2c3c5a..424569f76f 100644 --- a/src/server.h +++ b/src/server.h @@ -248,6 +248,8 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CMD_ALLOW_BUSY ((1ULL << 26)) #define CMD_MODULE_GETCHANNELS (1ULL << 27) /* Use the modules getchannels interface. */ #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28) +/* Command flags. Please don't forget to add command flag documentation in struct + * serverCommand in this file. */ /* Command flags that describe ACLs categories. */ #define ACL_CATEGORY_KEYSPACE (1ULL << 0) @@ -2472,6 +2474,8 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * CMD_DENYOOM: May increase memory usage once called. Don't allow if out * of memory. * + * CMD_MODULE: Command exported by module. + * * CMD_ADMIN: Administrative command, like SAVE or SHUTDOWN. * * CMD_PUBSUB: Pub/Sub related command. @@ -2518,11 +2522,22 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * * CMD_NO_MANDATORY_KEYS: This key arguments for this command are optional. * + * CMD_PROTECTED: The command is a protected command, see enable-debug-command for more details. + * + * CMD_MODULE_GETKEYS: Use the modules getkeys interface. + * + * CMD_MODULE_NO_CLUSTER: Deny on cluster. + * * CMD_NO_MULTI: The command is not allowed inside a transaction * + * CMD_MOVABLE_KEYS: The legacy range spec doesn't cover all keys. Populated by + * populateCommandLegacyRangeSpec. + * * CMD_ALLOW_BUSY: The command can run while another command is running for * a long time (timedout script, module command that yields) * + * CMD_MODULE_GETCHANNELS: Use the modules getchannels interface. + * * CMD_TOUCHES_ARBITRARY_KEYS: The command may touch (and cause lazy-expire) * arbitrary key (i.e not provided in argv) * From 3d5acdd41d322abd99babd6abd6ca9dc3ce3d5c2 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 25 Dec 2024 04:58:49 +0200 Subject: [PATCH 038/101] Fix restore replica output bytes stat update (#1486) This PR fixes the missing stat update for `total_net_repl_output_bytes` that was removed during the refactoring in PR #758. The metric was not being updated when writing to replica connections. Changes: - Restored the stat update in postWriteToClient for replica connections - Added integration test to verify the metric is properly updated Signed-off-by: Uri Yagelnik Co-authored-by: Binbin --- src/networking.c | 2 ++ tests/integration/replication.tcl | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/networking.c b/src/networking.c index 9f36f24275..d93046a603 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2231,6 +2231,8 @@ int postWriteToClient(client *c) { server.stat_total_writes_processed++; if (getClientType(c) != CLIENT_TYPE_REPLICA) { _postWriteToClient(c); + } else { + server.stat_net_repl_output_bytes += c->nwritten > 0 ? 
c->nwritten : 0; } if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 1b5b0c030a..6d3c4e934f 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -194,6 +194,30 @@ start_server {tags {"repl external:skip"}} { } assert_match {*calls=1,*,rejected_calls=0,failed_calls=1*} [cmdrstat blpop $B] } + + test {Replica output bytes metric} { + # reset stats + $A config resetstat + + set info [$A info stats] + set replica_bytes_output [getInfoProperty $info "total_net_repl_output_bytes"] + assert_equal $replica_bytes_output 0 + + # sent set command to primary + $A set key value + + # wait for command propagation + wait_for_condition 50 100 { + [$B get key] eq {value} + } else { + fail "Replica did not receive the command" + } + + # get the new stats + set info [$A info stats] + set replica_bytes_output [getInfoProperty $info "total_net_repl_output_bytes"] + assert_morethan $replica_bytes_output 0 + } } } From 2f07b663bc6075f685f9140f0a7b58759b90c0c4 Mon Sep 17 00:00:00 2001 From: gmbnomis Date: Fri, 27 Dec 2024 00:55:20 +0100 Subject: [PATCH 039/101] Fix JSON description of SET command (#1473) In the `arguments` section, the `arguments` key is only used for arguments of type `block` or `oneof`. Consequently, the `arguments` given for `IFEQ` are ignored by the server. However, they lead to strange results when rendering the command's page for the web documentation. Fix this by removing `arguments` for `IFEQ`. Signed-off-by: Simon Baatz --- src/commands/set.json | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/commands/set.json b/src/commands/set.json index 3d3800f11d..601bd676a2 100644 --- a/src/commands/set.json +++ b/src/commands/set.json @@ -111,14 +111,7 @@ "type": "string", "token": "IFEQ", "since": "8.1.0", - "summary": "Sets the key's value only if the current value matches the specified comparison value.", - "arguments": [ - { - "name": "comparison-value", - "type": "string", - "summary": "The value to compare with the current key's value before setting." - } - ] + "summary": "Sets the key's value only if the current value matches the specified comparison value." } ] }, From 810a437da066a3479f17084e9a3f4b8910b3b7ed Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Sun, 29 Dec 2024 08:22:49 -0800 Subject: [PATCH 040/101] Immediately restart the defrag cycle if we still need to defrag (#1492) --- src/defrag.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/defrag.c b/src/defrag.c index e9f40d4fab..a5d6c69c1c 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -1112,6 +1112,9 @@ static void endDefragCycle(bool normal_termination) { server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); server.stat_last_active_defrag_time = 0; server.active_defrag_cpu_percent = 0; + + /* Immediately check to see if we should start another defrag cycle. */ + monitorActiveDefrag(); } From 4f59458502100e8f971242222d841a97ad26b312 Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Mon, 30 Dec 2024 12:56:39 -0800 Subject: [PATCH 041/101] Only (re-)send MEET packet once every handshake timeout period (#1441) Add `meet_sent` field in `clusterNode` indicating the last time we sent a MEET packet. Use this field to only (re-)send a MEET packet once every handshake timeout period when detecting a node without an inbound link. 
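Schematically, the new re-send rule amounts to a check like the
following (a condensed sketch, not the literal server code;
`should_resend_meet` is a hypothetical name, the field names follow
the patch, and `handshake_timeout` corresponds to
max(cluster-node-timeout, 1000) milliseconds as in getHandshakeTimeout()):

```
typedef long long mstime_t;

/* Assumes the caller already verified the node is in a normal state
 * with an outbound link but no inbound link. */
static int should_resend_meet(mstime_t now,
                              mstime_t inbound_link_freed_time,
                              mstime_t meet_sent,
                              mstime_t handshake_timeout) {
    /* MEET only a node whose inbound link has been missing for a full
     * handshake timeout, and at most once per timeout period, leaving
     * the other side time to complete the handshake. */
    return now - inbound_link_freed_time > handshake_timeout &&
           now - meet_sent > handshake_timeout;
}
```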
When receiving multiple MEET packets on the same link while the node
is in handshake state, instead of dropping the packet, we now simply
prevent the creation of a new node. This way we still process the MEET
packet's gossip and reply with a PONG, as with any other packet.

Improve some logging messages to include `human_nodename`. Add
`nodeExceedsHandshakeTimeout()` function.

This is a follow-up to this previous PR:
https://github.com/valkey-io/valkey/pull/1307

It is also a partial fix for the crash described in:
https://github.com/valkey-io/valkey/pull/1436

---------

Signed-off-by: Pierre Turin
---
 src/cluster_legacy.c | 139 ++++++++++---------
 src/cluster_legacy.h | 1 +
 tests/unit/cluster/cluster-reliable-meet.tcl | 7 +-
 3 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 3d838dfe06..80889a79d8 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -121,6 +121,7 @@ void freeClusterLink(clusterLink *link);
 int verifyClusterNodeId(const char *name, int length);
 sds clusterEncodeOpenSlotsAuxField(int rdbflags);
 int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s);
+static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now);

 /* Only primaries that own slots have voting rights.
 * Returns 1 if the node has voting rights, otherwise returns 0. */
@@ -1346,9 +1347,10 @@ clusterLink *createClusterLink(clusterNode *node) {
 * with this link will have the 'link' field set to NULL. */
 void freeClusterLink(clusterLink *link) {
 serverAssert(link != NULL);
- serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s",
+ serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s (%s)",
 link->node ? link->node->name : "",
- link->inbound ? "inbound" : "outbound");
+ link->inbound ? "inbound" : "outbound",
+ link->node ? link->node->human_nodename : "");

 if (link->conn) {
 connClose(link->conn);
@@ -1502,6 +1504,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
 node->last_in_ping_gossip = 0;
 node->ping_sent = node->pong_received = 0;
 node->data_received = 0;
+ node->meet_sent = 0;
 node->fail_time = 0;
 node->link = NULL;
 node->inbound_link = NULL;
@@ -1723,7 +1726,7 @@ void clusterAddNode(clusterNode *node) {
 */
 void clusterDelNode(clusterNode *delnode) {
 serverAssert(delnode != NULL);
- serverLog(LL_DEBUG, "Deleting node %.40s from cluster view", delnode->name);
+ serverLog(LL_DEBUG, "Deleting node %.40s (%s) from cluster view", delnode->name, delnode->human_nodename);

 int j;
 dictIterator *di;
@@ -3143,27 +3146,6 @@ int clusterProcessPacket(clusterLink *link) {
 return 1;
 }

- if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) {
- /* If the link is bound to a node and the node is in the handshake state, and we receive
- * a MEET packet, it may be that the sender sent multiple MEET packets so in here we are
We improved the re-send logic of MEET in - * #1441, now we will only re-send MEET packet once every handshake timeout period. - * - * Note that in getNodeFromLinkAndMsg, the node in the handshake state has a random name - * and not truly "known", so we don't know the sender. Dropping the MEET packet can prevent - * us from creating a random node, avoid incorrect link binding, and avoid duplicate MEET - * packet eliminate the handshake state. */ - serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state", - link->node->name); - return 1; - } - uint16_t flags = ntohs(hdr->flags); uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0; clusterNode *sender = getNodeFromLinkAndMsg(link, hdr); @@ -3261,42 +3243,59 @@ int clusterProcessPacket(clusterLink *link) { if (type == CLUSTERMSG_TYPE_MEET) { if (!sender) { - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. */ - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (!link->node) { + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + if (nodeIp2String(node->ip, link, hdr->myip) != C_OK) { + /* We cannot get the IP info from the link, it probably means the connection is closed. */ + serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " + "because the connection has an error"); + freeClusterLink(link); + freeClusterNode(node); + return 0; + } + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } else { + /* A second MEET packet was received on an existing link during the handshake process. + * This happens when the other node detects no inbound link, and re-sends a MEET packet + * before this node can respond with a PING. This MEET is a no-op. + * + * Note: Nodes in HANDSHAKE state are not fully "known" (random names), so the sender + * remains unidentified at this point. The MEET packet might be re-sent if the inbound + * connection is still unestablished by the next cron cycle. 
+ */ + debugServerAssert(link->inbound && nodeInHandshake(link->node)); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); /* If this is a MEET packet from an unknown node, we still process * the gossip section here since we have to trust the sender because * of the message type. */ clusterProcessGossipSection(hdr, link); - } else if (sender->link && now - sender->ctime > server.cluster_node_timeout) { + } else if (sender->link && nodeExceedsHandshakeTimeout(sender, now)) { /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not * know it. - * Freeing my outbound link to that node, to force a reconnect and sending a PING. + * Free my outbound link to that node, triggering a reconnect and a PING over the new link. * Once that node receives our PING, it should recognize the new connection as an inbound link from me. * We should only free the outbound link if the node is known for more time than the handshake timeout, * since during this time, the other side might still be trying to complete the handshake. */ /* We should always receive a MEET packet on an inbound link. */ serverAssert(link != sender->link); - serverLog(LL_NOTICE, "Freeing outbound link to node %.40s after receiving a MEET packet from this known node", - sender->name); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s (%s) after receiving a MEET packet from this known node", + sender->name, sender->human_nodename); freeClusterLink(sender->link); } } @@ -4062,7 +4061,12 @@ void clusterSendPing(clusterLink *link, int type) { clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); + if (!link->inbound) { + if (type == CLUSTERMSG_TYPE_PING) + link->node->ping_sent = mstime(); + else if (type == CLUSTERMSG_TYPE_MEET) + link->node->meet_sent = mstime(); + } /* Populate the gossip fields */ int maxiterations = wanted * 3; @@ -4981,10 +4985,22 @@ void clusterHandleManualFailover(void) { * CLUSTER cron job * -------------------------------------------------------------------------- */ +static mstime_t getHandshakeTimeout(void) { + /* The handshake timeout is the time after which a handshake node that was + * not turned into a normal node is removed from the nodes. Usually it is + * just the cluster_node_timeout value, but when cluster_node_timeout is + * too small we use the value of 1 second. */ + return max(server.cluster_node_timeout, 1000); +} + +static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now) { + return now - node->ctime > getHandshakeTimeout() ? 1 : 0; +} + /* Check if the node is disconnected and re-establish the connection. * Also update a few stats while we are here, that can be used to make * better decisions in other part of the code. */ -static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { +static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t now) { /* Not interested in reconnecting the link with myself or nodes * for which we have no address. */ if (node->flags & (CLUSTER_NODE_MYSELF | CLUSTER_NODE_NOADDR)) return 1; @@ -4993,19 +5009,22 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. 
*/ - if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d after %lldms", node->ip, - node->cport, handshake_timeout); + if (nodeInHandshake(node) && nodeExceedsHandshakeTimeout(node, now)) { + serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d", node->ip, node->cport); clusterDelNode(node); return 1; } - if (node->link != NULL && node->inbound_link == NULL && nodeInNormalState(node) && - now - node->inbound_link_freed_time > handshake_timeout) { + if (nodeInNormalState(node) && node->link != NULL && node->inbound_link == NULL && + now - node->inbound_link_freed_time > getHandshakeTimeout() && + now - node->meet_sent > getHandshakeTimeout()) { /* Node has an outbound link, but no inbound link for more than the handshake timeout. * This probably means this node does not know us yet, whereas we know it. - * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. */ + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. + * We make sure to not re-send a MEET packet more than once every handshake timeout period, so as to + * leave the other node time to complete the handshake. */ node->flags |= CLUSTER_NODE_MEET; - serverLog(LL_NOTICE, "Sending MEET packet to node %.40s because there is no inbound link for it", node->name); + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s (%s) because there is no inbound link for it", + node->name, node->human_nodename); clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); } @@ -5066,19 +5085,11 @@ void clusterCron(void) { mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; static unsigned long long iteration = 0; - mstime_t handshake_timeout; iteration++; /* Number of times this function was called so far. */ clusterUpdateMyselfHostname(); - /* The handshake timeout is the time after which a handshake node that was - * not turned into a normal node is removed from the nodes. Usually it is - * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use - * the value of 1 second. */ - handshake_timeout = server.cluster_node_timeout; - if (handshake_timeout < 1000) handshake_timeout = 1000; - /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ server.cluster->stats_pfail_nodes = 0; /* Run through some of the operations we want to do on each cluster node. */ @@ -5091,7 +5102,7 @@ void clusterCron(void) { /* The protocol is that function(s) below return non-zero if the node was * terminated. 
*/ - if (clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; + if (clusterNodeCronHandleReconnect(node, now)) continue; } dictReleaseIterator(di); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index d3e1c3459e..ac14bd583c 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -340,6 +340,7 @@ struct _clusterNode { mstime_t ping_sent; /* Unix time we sent latest ping */ mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ + mstime_t meet_sent; /* Unix time we sent latest meet packet */ mstime_t fail_time; /* Unix time when FAIL flag was set */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index f189e96d5b..e32bbdab11 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -70,7 +70,7 @@ tags {tls:skip external:skip cluster} { [CI 0 cluster_stats_messages_meet_received] >= 4 && [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] } else { - fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" + fail "Unexpected cluster state: node 1 cluster_state:[CI 1 cluster_state], node 0 cluster_state: [CI 0 cluster_state]" } } } ;# stop servers @@ -178,14 +178,13 @@ start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout # Wait for Node 0's handshake to timeout wait_for_condition 50 100 { - [cluster_get_first_node_in_handshake 1] eq {} + [cluster_get_first_node_in_handshake 0] eq {} } else { fail "Node 0 never exited handshake state" } - # At this point Node 0 knows Node 1 & 2 through the gossip, but they don't know Node 0. + # At this point Node 0 knows Node 2 through the gossip, but Node 1 & 2 don't know Node 0. wait_for_condition 50 100 { - [cluster_get_node_by_id 0 $node1_id] != {} && [cluster_get_node_by_id 0 $node2_id] != {} && [cluster_get_node_by_id 1 $node0_id] eq {} && [cluster_get_node_by_id 2 $node0_id] eq {} From 68f50585129b69153f2cdceee955791373a237ab Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Tue, 31 Dec 2024 04:58:06 +0800 Subject: [PATCH 042/101] Make global configs as static (#1159) Don't expose the static configs symbol, and make configEnumGetValue a static function. Signed-off-by: zhenwei pi --- src/config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config.c b/src/config.c index f08b79ebbd..59cf0d9400 100644 --- a/src/config.c +++ b/src/config.c @@ -283,7 +283,7 @@ struct standardConfig { void *privdata; /* privdata for this config, for module configs this is a ModuleConfig struct */ }; -dict *configs = NULL; /* Runtime config values */ +static dict *configs = NULL; /* Runtime config values */ /* Lookup a config by the provided sds string name, or return NULL * if the config does not exist */ @@ -297,7 +297,7 @@ static standardConfig *lookupConfig(sds name) { *----------------------------------------------------------------------------*/ /* Get enum value from name. If there is no match INT_MIN is returned.
*/ -int configEnumGetValue(configEnum *ce, sds *argv, int argc, int bitflags) { +static int configEnumGetValue(configEnum *ce, sds *argv, int argc, int bitflags) { if (argc == 0 || (!bitflags && argc != 1)) return INT_MIN; int values = 0; for (int i = 0; i < argc; i++) { From 399fb0881dd7f90293431b3a102825372256e291 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Wed, 1 Jan 2025 16:33:09 +0200 Subject: [PATCH 043/101] Align rejected unblocked commands to update the correct error statistic (#577) Currently, in case a blocked command is unblocked externally (e.g. due to the relevant slot being migrated or the CLIENT UNBLOCK command being issued), the command statistics will always update the failed_calls error statistic. This leads to misalignment with https://github.com/valkey-io/valkey/commit/90b9f08e5d1657e7bfffe43f31f6663bf469ee75 as well as some inconsistencies. For example, when a key is migrated during cluster slot migration, clients blocked on XREADGROUP will be unblocked and update the rejected_calls stat, while clients blocked on BLPOP will be unblocked and update the failed_calls stat. In this PR we add an explicit indication in updateStatsOnUnblock of whether the command was rejected or failed. --------- Signed-off-by: ranshid Signed-off-by: Ran Shidlansik --- src/blocked.c | 23 ++++++++++++++++++----- src/module.c | 5 +++-- src/server.h | 2 +- tests/integration/replication.tcl | 2 +- tests/unit/info.tcl | 2 +- 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index aeec560b3f..d356ea5c07 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -105,15 +105,27 @@ void blockClient(client *c, int btype) { * he will attempt to reprocess the command which will update the statistics. * However in case the client was timed out or in case of module blocked client is being unblocked * the command will not be reprocessed and we need to make stats update. - * This function will make updates to the commandstats, slot-stats, slowlog and monitors.*/ -void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int had_errors) { + * This function will make updates to the commandstats, slot-stats, slowlog and monitors. + * The failed_or_rejected parameter indicates whether the blocked command failed internally or was + * rejected/aborted externally. In case the command was rejected the value ERROR_COMMAND_REJECTED should be passed. + * In case the command failed internally, ERROR_COMMAND_FAILED should be passed.
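+ * For example, unblockClientOnError() (a client aborted externally, e.g. by CLIENT UNBLOCK ERROR or a slot + * migration) passes ERROR_COMMAND_REJECTED, while the module unblock paths pass ERROR_COMMAND_FAILED when the + * unblocked command produced an error reply.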
+ * A value of zero indicates that no error was reported after the command was unblocked */ +void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_or_rejected) { const ustime_t total_cmd_duration = c->duration + blocked_us + reply_us; c->lastcmd->microseconds += total_cmd_duration; clusterSlotStatsAddCpuDuration(c, total_cmd_duration); c->lastcmd->calls++; c->commands_processed++; server.stat_numcommands++; - if (had_errors) c->lastcmd->failed_calls++; + debugServerAssertWithInfo(c, NULL, failed_or_rejected >= 0 && failed_or_rejected <= ERROR_COMMAND_FAILED); + if (failed_or_rejected) { + if (failed_or_rejected & ERROR_COMMAND_FAILED) + c->lastcmd->failed_calls++; + else if (failed_or_rejected & ERROR_COMMAND_REJECTED) + c->lastcmd->rejected_calls++; + else + debugServerAssertWithInfo(c, NULL, 0); + } if (server.latency_tracking_enabled) updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), total_cmd_duration * 1000); /* Log the command into the Slow log if needed. */ @@ -680,7 +692,8 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { elapsedStart(&replyTimer); if (moduleTryServeClientBlockedOnKey(c, key)) { - updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), + ((server.stat_total_error_replies != prev_error_replies) ? ERROR_COMMAND_FAILED : 0)); moduleUnblockClient(c); } /* We need to call afterCommand even if the client was not unblocked @@ -709,7 +722,7 @@ void unblockClientOnTimeout(client *c) { * If err_str is provided it will be used to reply to the blocked client */ void unblockClientOnError(client *c, const char *err_str) { if (err_str) addReplyError(c, err_str); - updateStatsOnUnblock(c, 0, 0, 1); + updateStatsOnUnblock(c, 0, 0, ERROR_COMMAND_REJECTED); if (c->flag.pending_command) c->flag.pending_command = 0; unblockClient(c, 1); } diff --git a/src/module.c b/src/module.c index db493dd8bc..a8676cb727 100644 --- a/src/module.c +++ b/src/module.c @@ -8325,7 +8325,7 @@ void moduleHandleBlockedClients(void) { if (c && !clientHasModuleAuthInProgress(c)) { int had_errors = c->deferred_reply_errors ? !!listLength(c->deferred_reply_errors) : (server.stat_total_error_replies != prev_error_replies); - updateStatsOnUnblock(c, bc->background_duration, reply_us, had_errors); + updateStatsOnUnblock(c, bc->background_duration, reply_us, (had_errors ? ERROR_COMMAND_FAILED : 0)); } if (c != NULL) { @@ -8411,7 +8411,8 @@ void moduleBlockedClientTimedOut(client *c, int from_module) { moduleFreeContext(&ctx); if (!from_module) - updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, bc->background_duration, 0, + ((server.stat_total_error_replies != prev_error_replies) ?
ERROR_COMMAND_FAILED : 0)); /* For timeout events, we do not want to call the disconnect callback, * because the blocked client will be automatically disconnected in diff --git a/src/server.h b/src/server.h index 424569f76f..f4fb663851 100644 --- a/src/server.h +++ b/src/server.h @@ -3735,7 +3735,7 @@ void blockPostponeClient(client *c); void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, long numreplicas, int numlocal); void replicationRequestAckFromReplicas(void); void signalDeletedKeyAsReady(serverDb *db, robj *key, int type); -void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int had_errors); +void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_or_rejected); void scanDatabaseForDeletedKeys(serverDb *emptied, serverDb *replaced_with); void totalNumberOfStatefulKeys(unsigned long *blocking_keys, unsigned long *blocking_keys_on_nokey, diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 6d3c4e934f..de7837a9a8 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -192,7 +192,7 @@ start_server {tags {"repl external:skip"}} { } else { fail "Master and replica have different digest: [$A debug digest] VS [$B debug digest]" } - assert_match {*calls=1,*,rejected_calls=0,failed_calls=1*} [cmdrstat blpop $B] + assert_match {*calls=1,*,rejected_calls=1*,failed_calls=0} [cmdrstat blpop $B] } test {Replica output bytes metric} { diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 3295c5e31a..4a638cac80 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -269,7 +269,7 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { r client unblock $rd_id error assert_error {UNBLOCKED*} {$rd read} assert_match {*count=1*} [errorstat UNBLOCKED] - assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat blpop] + assert_match {*calls=1,*,rejected_calls=1,failed_calls=0} [cmdstat blpop] assert_equal [s total_error_replies] 1 $rd close } From 471ecf4accdbade989ca17e65496db4a6c907830 Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Thu, 2 Jan 2025 04:00:29 +0200 Subject: [PATCH 044/101] Fix unreliable dual channel Valgrind tests (#1500) Used same approach as PR #1165 to solve random failures. 
Resolves #1491 Signed-off-by: naglera --- tests/integration/dual-channel-replication.tcl | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 3adf9ce9fd..4ca70651a1 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -1256,7 +1256,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes - $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry + $primary config set repl-diskless-sync-delay 0 # Generating RDB will take 100 sec to generate $primary debug populate 1000000 primary 1 @@ -1270,9 +1270,18 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - $replica config set repl-timeout 10 $replica config set repl-diskless-load flush-before-load + if {$::valgrind} { + $primary config set repl-timeout 100 + $replica config set repl-timeout 100 + set max_tries 5000 + } else { + $primary config set repl-timeout 10 + $replica config set repl-timeout 10 + set max_tries 500 + } + test "Replica notice main-connection killed during rdb load callback" {; # https://github.com/valkey-io/valkey/issues/1152 set loglines [count_log_lines 0] $replica replicaof $primary_host $primary_port @@ -1287,6 +1296,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10 $primary set key val set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary debug log "killing replica main connection $replica_main_conn_id" assert {$replica_main_conn_id != ""} set loglines [count_log_lines 0] @@ -1298,8 +1308,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Primary did not free repl buf block after sync failure" } - wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines 1000 10 - verify_replica_online $primary 0 500 + wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines $max_tries 10 + verify_replica_online $primary 0 $max_tries } } } From 15189c9b7eb56a537f848cabc872a645d41016fc Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:01:55 +0200 Subject: [PATCH 045/101] replication: fix io-threads possible race by moving waitForClientIO (#1422) ### Fix race with pending writes in replica state transition #### The Problem In #60 (Dual channel replication) a new `connWrite` call was added before the `waitForClientIO` check. This created a race condition where the main thread may attempt to write to a client that could have pending writes in IO threads. #### The Fix Moved the `waitForClientIO()` call earlier in `syncCommand`, before any `connWrite` call. This ensures all pending IO operations are completed before attempting to write to the client. 
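To make the ordering concrete, here is a minimal sketch of the invariant this change restores in `syncCommand()` (simplified: the failover check, PSYNC negotiation and the actual replica state transition are elided):

```c
void syncCommand(client *c) {
    /* Ignore SYNC if this client is already a replica. */
    if (c->flag.replica) return;

    /* Must run before any connWrite(): an IO thread may still own a
     * pending read or write for this client, and a concurrent write
     * from the main thread would race with it. */
    waitForClientIO(c);

    /* From here on the main thread may safely reply on c->conn, e.g.
     * failover validation errors or the +FULLRESYNC response, and flip
     * the client into replica state. */
    /* ... failover check, PSYNC handling, replica state setup ... */
}
```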
--------- Signed-off-by: Uri Yagelnik --- src/replication.c | 5 +++-- src/socket.c | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/replication.c b/src/replication.c index f907771e71..160b0c4d5e 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1036,6 +1036,9 @@ void syncCommand(client *c) { /* ignore SYNC if already replica or in monitor mode */ if (c->flag.replica) return; + /* Wait for any IO pending operation to finish before changing the client state to replica */ + waitForClientIO(c); + /* Check if this is a failover request to a replica with the same replid and * become a primary if so. */ if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr, "psync") && !strcasecmp(c->argv[3]->ptr, "failover")) { @@ -1148,8 +1151,6 @@ void syncCommand(client *c) { c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; if (server.repl_disable_tcp_nodelay) connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; - /* Wait for any IO pending operation to finish before changing the client state */ - waitForClientIO(c); c->flag.replica = 1; listAddNodeTail(server.replicas, c); diff --git a/src/socket.c b/src/socket.c index d89e6c8767..94869f3f25 100644 --- a/src/socket.c +++ b/src/socket.c @@ -29,6 +29,7 @@ #include "server.h" #include "connhelpers.h" +#include "io_threads.h" /* The connections module provides a lean abstraction of network connections * to avoid direct socket and async event management across the server code base. @@ -154,6 +155,10 @@ static void connSocketClose(connection *conn) { } static int connSocketWrite(connection *conn, const void *data, size_t data_len) { + /* Assert the main thread is not writing to a connection that is currently offloaded. */ + debugServerAssert(!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD) || !inMainThread() || + ((client *)connGetPrivateData(conn))->io_write_state != CLIENT_PENDING_IO); + int ret = write(conn->fd, data, data_len); if (ret < 0 && errno != EAGAIN) { conn->last_errno = errno; @@ -182,6 +187,11 @@ static int connSocketWritev(connection *conn, const struct iovec *iov, int iovcn } static int connSocketRead(connection *conn, void *buf, size_t buf_len) { + /* Assert the main thread is not reading from a connection that is currently offloaded. */ + debugServerAssert(!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD) || !inMainThread() || + ((client *)connGetPrivateData(conn))->io_read_state != CLIENT_PENDING_IO); + + int ret = read(conn->fd, buf, buf_len); if (!ret) { conn->state = CONN_STATE_CLOSED; From f7ac777e52c75670db8d6638b6f8dbbcaf96b714 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Thu, 2 Jan 2025 11:42:39 +0200 Subject: [PATCH 046/101] Offload reading the replication stream to IO threads (#1449) Support Primary client IO offload. Related issue: https://github.com/valkey-io/valkey/issues/761 --------- Signed-off-by: Uri Yagelnik --- src/io_threads.c | 9 +++++---- src/networking.c | 16 +++++++++++++++- src/replication.c | 3 +++ src/server.h | 1 + 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/io_threads.c b/src/io_threads.c index 90f5b88700..66ef4948b6 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -321,8 +321,8 @@ int trySendReadToIOThreads(client *c) { if (server.active_io_threads_num <= 1) return C_ERR; /* If IO thread is already reading, return C_OK to make sure the main thread will not handle it. 
*/ if (c->io_read_state != CLIENT_IDLE) return C_OK; - /* Currently, replica/master writes are not offloaded and are processed synchronously. */ - if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* Currently, replica reads are not offloaded to IO threads. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* With Lua debug client we may call connWrite directly in the main thread */ if (c->flag.lua_debug) return C_ERR; /* For simplicity let the main-thread handle the blocked clients */ @@ -345,6 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; + c->read_flags |= c->flag.primary ? READ_FLAGS_PRIMARY : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); @@ -363,8 +364,8 @@ int trySendWriteToIOThreads(client *c) { if (c->io_write_state != CLIENT_IDLE) return C_OK; /* Nothing to write */ if (!clientHasPendingReplies(c)) return C_ERR; - /* Currently, replica/master writes are not offloaded and are processed synchronously. */ - if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* Currently, replica writes are not offloaded to IO threads. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* We can't offload debugged clients as the main-thread may read at the same time */ if (c->flag.lua_debug) return C_ERR; diff --git a/src/networking.c b/src/networking.c index d93046a603..2190fca5bf 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2592,6 +2592,16 @@ void resetClient(client *c) { } } +void resetClientIOState(client *c) { + c->nwritten = 0; + c->nread = 0; + c->io_read_state = c->io_write_state = CLIENT_IDLE; + c->io_parsed_cmd = NULL; + c->flag.pending_command = 0; + c->io_last_bufpos = 0; + c->io_last_reply_block = NULL; +} + /* Initializes the shared query buffer to a new sds with the default capacity. * Need to ensure the initlen is not less than readlen in readToQueryBuf. */ void initSharedQueryBuf(void) { @@ -4962,7 +4972,11 @@ void ioThreadReadQueryFromClient(void *data) { } done: - trimClientQueryBuffer(c); + /* Only trim query buffer for non-primary clients + * Primary client's buffer is handled by main thread using repl_applied position */ + if (!(c->read_flags & READ_FLAGS_PRIMARY)) { + trimClientQueryBuffer(c); + } atomic_thread_fence(memory_order_release); c->io_read_state = CLIENT_COMPLETED_IO; } diff --git a/src/replication.c b/src/replication.c index 160b0c4d5e..bec52a84d0 100644 --- a/src/replication.c +++ b/src/replication.c @@ -4134,6 +4134,8 @@ void replicationCachePrimary(client *c) { serverAssert(server.primary != NULL && server.cached_primary == NULL); serverLog(LL_NOTICE, "Caching the disconnected primary state."); + /* Wait for IO operations to be done before proceeding */ + waitForClientIO(c); /* Unlink the client from the server structures. */ unlinkClient(c); @@ -4151,6 +4153,7 @@ void replicationCachePrimary(client *c) { c->reply_bytes = 0; c->bufpos = 0; resetClient(c); + resetClientIOState(c); /* Save the primary. Server.primary will be set to null later by * replicationHandlePrimaryDisconnection(). 
*/ diff --git a/src/server.h b/src/server.h index f4fb663851..582392bca0 100644 --- a/src/server.h +++ b/src/server.h @@ -2831,6 +2831,7 @@ void logInvalidUseAndFreeClientAsync(client *c, const char *fmt, ...); void beforeNextClient(client *c); void clearClientConnectionState(client *c); void resetClient(client *c); +void resetClientIOState(client *c); void freeClientOriginalArgv(client *c); void freeClientArgv(client *c); void sendReplyToClient(connection *conn); From 245f51df9b27719f1e3ea69ed9f060bd10d98998 Mon Sep 17 00:00:00 2001 From: Wen Hui Date: Thu, 2 Jan 2025 10:12:09 -0500 Subject: [PATCH 047/101] Remove releasetools folder (#1496) The release tool utils/releasetools/ no longer works in Valkey, so in this PR we remove it. Signed-off-by: hwware --- utils/releasetools/01_create_tarball.sh | 14 ---------- utils/releasetools/02_upload_tarball.sh | 23 ---------------- utils/releasetools/03_test_release.sh | 28 -------------------- utils/releasetools/04_release_hash.sh | 13 --------- utils/releasetools/changelog.tcl | 35 ------------------------- 5 files changed, 113 deletions(-) delete mode 100755 utils/releasetools/01_create_tarball.sh delete mode 100755 utils/releasetools/02_upload_tarball.sh delete mode 100755 utils/releasetools/03_test_release.sh delete mode 100755 utils/releasetools/04_release_hash.sh delete mode 100755 utils/releasetools/changelog.tcl diff --git a/utils/releasetools/01_create_tarball.sh b/utils/releasetools/01_create_tarball.sh deleted file mode 100755 index 08fdcb6d16..0000000000 --- a/utils/releasetools/01_create_tarball.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/01_create_tarball.sh " - exit 1 -fi - -TAG=$1 -TARNAME="valkey-${TAG}.tar" -echo "Generating /tmp/${TARNAME}" -git archive $TAG --prefix valkey-${TAG}/ > /tmp/$TARNAME || exit 1 -echo "Gizipping the archive" -rm -f /tmp/$TARNAME.gz -gzip -9 /tmp/$TARNAME diff --git a/utils/releasetools/02_upload_tarball.sh b/utils/releasetools/02_upload_tarball.sh deleted file mode 100755 index dcd94ef383..0000000000 --- a/utils/releasetools/02_upload_tarball.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/02_upload_tarball.sh " - exit 1 -fi - -echo "Uploading..." -scp /tmp/valkey-${1}.tar.gz ubuntu@host.redis.io:/var/www/download/releases/ -echo "Updating web site... " -echo "Please check the github action tests for the release."
-echo "Press any key if it is a stable release, or Ctrl+C to abort" -read x -ssh ubuntu@host.redis.io "cd /var/www/download; - rm -rf valkey-${1}.tar.gz; - wget http://download.redis.io/releases/redis-${1}.tar.gz; - tar xvzf redis-${1}.tar.gz; - rm -rf valkey-stable; - mv valkey-${1} valkey-stable; - tar cvzf valkey-stable.tar.gz valkey-stable; - rm -rf valkey-${1}.tar.gz; - shasum -a 256 valkey-stable.tar.gz > valkey-stable.tar.gz.SHA256SUM; - " diff --git a/utils/releasetools/03_test_release.sh b/utils/releasetools/03_test_release.sh deleted file mode 100755 index 2480d8cfd6..0000000000 --- a/utils/releasetools/03_test_release.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh -set -e -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/03_test_release.sh " - exit 1 -fi - -TAG=$1 -TARNAME="valkey-${TAG}.tar.gz" -DOWNLOADURL="http://download.redis.io/releases/${TARNAME}" - -echo "Doing sanity test on the actual tarball" - -cd /tmp -rm -rf test_release_tmp_dir -mkdir test_release_tmp_dir -cd test_release_tmp_dir -rm -f $TARNAME -rm -rf valkey-${TAG} -wget $DOWNLOADURL -tar xvzf $TARNAME -cd valkey-${TAG} -make -./runtest -./runtest-sentinel -./runtest-cluster -./runtest-moduleapi diff --git a/utils/releasetools/04_release_hash.sh b/utils/releasetools/04_release_hash.sh deleted file mode 100755 index 8be286fabc..0000000000 --- a/utils/releasetools/04_release_hash.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/04_release_hash.sh " - exit 1 -fi - -SHA=$(curl -s http://download.redis.io/releases/redis-${1}.tar.gz | shasum -a 256 | cut -f 1 -d' ') -ENTRY="hash valkey-${1}.tar.gz sha256 $SHA http://download.redis.io/releases/redis-${1}.tar.gz" -echo $ENTRY >> ../valkey-hashes/README -echo "Press any key to commit, Ctrl-C to abort)." -read yes -(cd ../valkey-hashes; git commit -a -m "${1} hash."; git push) diff --git a/utils/releasetools/changelog.tcl b/utils/releasetools/changelog.tcl deleted file mode 100755 index e2f8d4364a..0000000000 --- a/utils/releasetools/changelog.tcl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env tclsh - -if {[llength $::argv] != 2 && [llength $::argv] != 3} { - puts "Usage: $::argv0 \[\]" - exit 1 -} - -set branch [lindex $::argv 0] -set ver [lindex $::argv 1] -if {[llength $::argv] == 3} { - set count [lindex ::$argv 2] -} else { - set count 100 -} - -set template { -================================================================================ -Valkey %ver% Released %date% -================================================================================ - -Upgrade urgency : -} - -set template [string trim $template] -append template "\n\n" -set date [clock format [clock seconds]] -set template [string map [list %ver% $ver %date% $date] $template] - -append template [exec git log $branch~$count..$branch "--format=format:%an in commit %h:%n %s" --shortstat] - -#Older, more verbose version. 
- -#append template [exec git log $branch~30..$branch "--format=format:+-------------------------------------------------------------------------------%n| %s%n| By %an, %ai%n+--------------------------------------------------------------------------------%nhttps://github.com/redis/redis/commit/%H%n%n%b" --stat] - -puts $template From cb7f2759a638cad3a579a8290ad445fbbd83c7b8 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Thu, 2 Jan 2025 17:35:10 +0000 Subject: [PATCH 048/101] Refactor: move all valkey modules related declarations to `module.h` (#1489) In this commit we move all structure and function declarations related to Valkey modules from `server.h` to the recently added `module.h` file. This re-organization makes it easier for new contributors to find the Valkey modules-related code, as well as reducing compilation times when changes are made to the modules code. --------- Signed-off-by: Ricardo Dias --- src/acl.c | 1 + src/aof.c | 3 +- src/blocked.c | 1 + src/cluster.c | 1 + src/cluster_legacy.c | 1 + src/config.c | 24 +---- src/db.c | 1 + src/debug.c | 3 +- src/defrag.c | 1 + src/functions.h | 1 + src/lazyfree.c | 1 + src/module.c | 40 +++++++- src/module.h | 219 +++++++++++++++++++++++++++++++++++++++- src/networking.c | 1 + src/notify.c | 1 + src/object.c | 1 + src/rdb.c | 9 +- src/replication.c | 1 + src/script.c | 1 + src/server.c | 1 + src/server.h | 222 +---------------------------------------- src/valkey-check-rdb.c | 1 + 22 files changed, 283 insertions(+), 252 deletions(-) diff --git a/src/acl.c b/src/acl.c index d1f970a805..725419dcf2 100644 --- a/src/acl.c +++ b/src/acl.c @@ -29,6 +29,7 @@ #include "server.h" #include "sha256.h" +#include "module.h" #include #include diff --git a/src/aof.c b/src/aof.c index 8af3a9928f..8ac44f64c2 100644 --- a/src/aof.c +++ b/src/aof.c @@ -31,6 +31,7 @@ #include "bio.h" #include "rio.h" #include "functions.h" +#include "module.h" #include #include @@ -2167,7 +2168,7 @@ int rewriteModuleObject(rio *r, robj *key, robj *o, int dbid) { ValkeyModuleIO io; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitIOContext(io, mt, r, key, dbid); + moduleInitIOContext(&io, mt, r, key, dbid); mt->aof_rewrite(&io, key, mv->value); if (io.ctx) { moduleFreeContext(io.ctx); diff --git a/src/blocked.c b/src/blocked.c index d356ea5c07..39050932d9 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -65,6 +65,7 @@ #include "latency.h" #include "monotonic.h" #include "cluster_slot_stats.h" +#include "module.h" /* forward declarations */ static void unblockClientWaitingData(client *c); diff --git a/src/cluster.c b/src/cluster.c index df6bb86454..39d9161b9c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -36,6 +36,7 @@ #include "server.h" #include "cluster.h" #include "cluster_slot_stats.h" +#include "module.h" #include diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 80889a79d8..a1b1d0e986 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -38,6 +38,7 @@ #include "cluster_slot_stats.h" #include "endianconv.h" #include "connection.h" +#include "module.h" #include #include diff --git a/src/config.c b/src/config.c index 59cf0d9400..0b459bb6e5 100644 --- a/src/config.c +++ b/src/config.c @@ -32,6 +32,7 @@ #include "cluster.h" #include "connection.h" #include "bio.h" +#include "module.h" #include #include @@ -371,20 +372,6 @@ void resetServerSaveParams(void) { server.saveparamslen = 0; } -void queueLoadModule(sds path, sds *argv, int argc) { - int i; - struct moduleLoadQueueEntry *loadmod; - - loadmod =
zmalloc(sizeof(struct moduleLoadQueueEntry)); - loadmod->argv = argc ? zmalloc(sizeof(robj *) * argc) : NULL; - loadmod->path = sdsnew(path); - loadmod->argc = argc; - for (i = 0; i < argc; i++) { - loadmod->argv[i] = createRawStringObject(argv[i], sdslen(argv[i])); - } - listAddNodeTail(server.loadmodule_queue, loadmod); -} - /* Parse an array of `arg_len` sds strings, validate and populate * server.client_obuf_limits if valid. * Used in CONFIG SET and configuration file parsing. */ @@ -567,7 +554,7 @@ void loadServerConfigFromString(char *config) { goto loaderr; } } else if (!strcasecmp(argv[0], "loadmodule") && argc >= 2) { - queueLoadModule(argv[1], &argv[2], argc - 2); + moduleEnqueueLoadModule(argv[1], &argv[2], argc - 2); } else if (strchr(argv[0], '.')) { if (argc < 2) { err = "Module config specified without value"; @@ -1583,12 +1570,7 @@ void rewriteConfigLoadmoduleOption(struct rewriteConfigState *state) { dictEntry *de; while ((de = dictNext(di)) != NULL) { struct ValkeyModule *module = dictGetVal(de); - line = sdsnew("loadmodule "); - line = sdscatsds(line, module->loadmod->path); - for (int i = 0; i < module->loadmod->argc; i++) { - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, module->loadmod->argv[i]->ptr); - } + line = moduleLoadQueueEntryToLoadmoduleOptionStr(module, "loadmodule"); rewriteConfigRewriteLine(state, "loadmodule", line, 1); } dictReleaseIterator(di); diff --git a/src/db.c b/src/db.c index e31d7e7f7f..1362b5f9dd 100644 --- a/src/db.c +++ b/src/db.c @@ -33,6 +33,7 @@ #include "script.h" #include "functions.h" #include "io_threads.h" +#include "module.h" #include #include diff --git a/src/debug.c b/src/debug.c index 4efe12e237..7e52874e30 100644 --- a/src/debug.c +++ b/src/debug.c @@ -38,6 +38,7 @@ #include "threads_mngr.h" #include "io_threads.h" #include "sds.h" +#include "module.h" #include #include @@ -263,7 +264,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) ValkeyModuleDigest md = {{0}, {0}, keyobj, db->id}; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitDigestContext(md); + moduleInitDigestContext(&md); if (mt->digest) { mt->digest(&md, mv->value); xorDigest(digest, md.x, sizeof(md.x)); diff --git a/src/defrag.c b/src/defrag.c index a5d6c69c1c..a755db559a 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -36,6 +36,7 @@ #include "server.h" #include "hashtable.h" #include "script.h" +#include "module.h" #include #ifdef HAVE_DEFRAG diff --git a/src/functions.h b/src/functions.h index 89e39fdc56..a48ff1b8db 100644 --- a/src/functions.h +++ b/src/functions.h @@ -55,6 +55,7 @@ typedef struct functionLibInfo functionLibInfo; /* ValkeyModule type aliases for scripting engine structs and types. 
*/ +typedef struct ValkeyModule ValkeyModule; typedef ValkeyModuleScriptingEngineCtx engineCtx; typedef ValkeyModuleScriptingEngineFunctionCtx functionCtx; typedef ValkeyModuleScriptingEngineCompiledFunction compiledFunction; diff --git a/src/lazyfree.c b/src/lazyfree.c index 4b4c7f06ad..c22d3da964 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -2,6 +2,7 @@ #include "bio.h" #include "functions.h" #include "cluster.h" +#include "module.h" #include diff --git a/src/module.c b/src/module.c index a8676cb727..dabea59d49 100644 --- a/src/module.c +++ b/src/module.c @@ -63,6 +63,7 @@ #include "valkeymodule.h" #include "io_threads.h" #include "functions.h" +#include "module.h" #include #include #include @@ -75,6 +76,12 @@ * pointers that have an API the module can call with them) * -------------------------------------------------------------------------- */ +struct moduleLoadQueueEntry { + sds path; + int argc; + robj **argv; +}; + struct ValkeyModuleInfoCtx { struct ValkeyModule *module; dict *requested_sections; @@ -644,6 +651,35 @@ void *VM_PoolAlloc(ValkeyModuleCtx *ctx, size_t bytes) { * Helpers for modules API implementation * -------------------------------------------------------------------------- */ +void moduleEnqueueLoadModule(sds path, sds *argv, int argc) { + int i; + struct moduleLoadQueueEntry *loadmod; + + loadmod = zmalloc(sizeof(struct moduleLoadQueueEntry)); + loadmod->argv = argc ? zmalloc(sizeof(robj *) * argc) : NULL; + loadmod->path = sdsnew(path); + loadmod->argc = argc; + for (i = 0; i < argc; i++) { + loadmod->argv[i] = createRawStringObject(argv[i], sdslen(argv[i])); + } + listAddNodeTail(server.loadmodule_queue, loadmod); +} + +sds moduleLoadQueueEntryToLoadmoduleOptionStr(ValkeyModule *module, + const char *config_option_str) { + sds line; + + line = sdsnew(config_option_str); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, module->loadmod->path); + for (int i = 0; i < module->loadmod->argc; i++) { + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, module->loadmod->argv[i]->ptr); + } + + return line; +} + client *moduleAllocTempClient(void) { client *c = NULL; @@ -7401,7 +7437,7 @@ void *VM_LoadDataTypeFromStringEncver(const ValkeyModuleString *str, const modul void *ret; rioInitWithBuffer(&payload, str->ptr); - moduleInitIOContext(io, (moduleType *)mt, &payload, NULL, -1); + moduleInitIOContext(&io, (moduleType *)mt, &payload, NULL, -1); /* All VM_Save*() calls always write a version 2 compatible format, so we * need to make sure we read the same. @@ -7433,7 +7469,7 @@ ValkeyModuleString *VM_SaveDataTypeToString(ValkeyModuleCtx *ctx, void *data, co ValkeyModuleIO io; rioInitWithBuffer(&payload, sdsempty()); - moduleInitIOContext(io, (moduleType *)mt, &payload, NULL, -1); + moduleInitIOContext(&io, (moduleType *)mt, &payload, NULL, -1); mt->rdb_save(&io, data); if (io.ctx) { moduleFreeContext(io.ctx); diff --git a/src/module.h b/src/module.h index f61ef1e3cb..78d9341ca9 100644 --- a/src/module.h +++ b/src/module.h @@ -5,13 +5,228 @@ * not part of the module API, but are used by the core to interact with modules */ -typedef struct ValkeyModuleCtx ValkeyModuleCtx; -typedef struct ValkeyModule ValkeyModule; +/* Extract encver / signature from a module type ID. 
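+ * The 64-bit ID keeps the type signature in the upper 54 bits and the encoding version in the lower + * VALKEYMODULE_TYPE_ENCVER_BITS (10) bits, so id == (signature << 10) | encver.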
*/ +#define VALKEYMODULE_TYPE_ENCVER_BITS 10 +#define VALKEYMODULE_TYPE_ENCVER_MASK ((1 << VALKEYMODULE_TYPE_ENCVER_BITS) - 1) +#define VALKEYMODULE_TYPE_ENCVER(id) ((id) & VALKEYMODULE_TYPE_ENCVER_MASK) +#define VALKEYMODULE_TYPE_SIGN(id) \ + (((id) & ~((uint64_t)VALKEYMODULE_TYPE_ENCVER_MASK)) >> VALKEYMODULE_TYPE_ENCVER_BITS) +/* Bit flags for moduleTypeAuxSaveFunc */ +#define VALKEYMODULE_AUX_BEFORE_RDB (1 << 0) +#define VALKEYMODULE_AUX_AFTER_RDB (1 << 1) + +struct ValkeyModule; +struct ValkeyModuleIO; +struct ValkeyModuleDigest; +struct ValkeyModuleCtx; +struct moduleLoadQueueEntry; +struct ValkeyModuleKeyOptCtx; +struct ValkeyModuleCommand; +struct clusterState; + +/* Each module type implementation should export a set of methods in order + * to serialize and deserialize the value in the RDB file, rewrite the AOF + * log, create the digest for "DEBUG DIGEST", and free the value when a key + * is deleted. */ +typedef void *(*moduleTypeLoadFunc)(struct ValkeyModuleIO *io, int encver); +typedef void (*moduleTypeSaveFunc)(struct ValkeyModuleIO *io, void *value); +typedef int (*moduleTypeAuxLoadFunc)(struct ValkeyModuleIO *rdb, int encver, int when); +typedef void (*moduleTypeAuxSaveFunc)(struct ValkeyModuleIO *rdb, int when); +typedef void (*moduleTypeRewriteFunc)(struct ValkeyModuleIO *io, struct serverObject *key, void *value); +typedef void (*moduleTypeDigestFunc)(struct ValkeyModuleDigest *digest, void *value); +typedef size_t (*moduleTypeMemUsageFunc)(const void *value); +typedef void (*moduleTypeFreeFunc)(void *value); +typedef size_t (*moduleTypeFreeEffortFunc)(struct serverObject *key, const void *value); +typedef void (*moduleTypeUnlinkFunc)(struct serverObject *key, void *value); +typedef void *(*moduleTypeCopyFunc)(struct serverObject *fromkey, struct serverObject *tokey, const void *value); +typedef int (*moduleTypeDefragFunc)(struct ValkeyModuleDefragCtx *ctx, struct serverObject *key, void **value); +typedef size_t (*moduleTypeMemUsageFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value, size_t sample_size); +typedef void (*moduleTypeFreeFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); +typedef size_t (*moduleTypeFreeEffortFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); +typedef void (*moduleTypeUnlinkFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); +typedef void *(*moduleTypeCopyFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); +typedef int (*moduleTypeAuthCallback)(struct ValkeyModuleCtx *ctx, void *username, void *password, const char **err); + + +/* The module type, which is referenced in each value of a given type, defines + * the methods and links to the module exporting the type. */ +typedef struct ValkeyModuleType { + uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. */ + struct ValkeyModule *module; + moduleTypeLoadFunc rdb_load; + moduleTypeSaveFunc rdb_save; + moduleTypeRewriteFunc aof_rewrite; + moduleTypeMemUsageFunc mem_usage; + moduleTypeDigestFunc digest; + moduleTypeFreeFunc free; + moduleTypeFreeEffortFunc free_effort; + moduleTypeUnlinkFunc unlink; + moduleTypeCopyFunc copy; + moduleTypeDefragFunc defrag; + moduleTypeAuxLoadFunc aux_load; + moduleTypeAuxSaveFunc aux_save; + moduleTypeMemUsageFunc2 mem_usage2; + moduleTypeFreeEffortFunc2 free_effort2; + moduleTypeUnlinkFunc2 unlink2; + moduleTypeCopyFunc2 copy2; + moduleTypeAuxSaveFunc aux_save2; + int aux_save_triggers; + char name[10]; /* 9 bytes name + null term. 
Charset: A-Z a-z 0-9 _- */ +} moduleType; + +/* In Object 'robj' structures of type OBJ_MODULE, the value pointer + * is set to the following structure, referencing the moduleType structure + * in order to work with the value, and at the same time providing a raw + * pointer to the value, as created by the module commands operating with + * the module type. + * + * So for example in order to free such a value, it is possible to use + * the following code: + * + * if (robj->type == OBJ_MODULE) { + * moduleValue *mt = robj->ptr; + * mt->type->free(mt->value); + * zfree(mt); // We need to release this in-the-middle struct as well. + * } + */ +typedef struct moduleValue { + moduleType *type; + void *value; +} moduleValue; + +/* This structure represents a module inside the system. */ +typedef struct ValkeyModule { + void *handle; /* Module dlopen() handle. */ + char *name; /* Module name. */ + int ver; /* Module version. We use just progressive integers. */ + int apiver; /* Module API version as requested during initialization.*/ + list *types; /* Module data types. */ + list *usedby; /* List of modules using APIs from this one. */ + list *using; /* List of modules we use some APIs of. */ + list *filters; /* List of filters the module has registered. */ + list *module_configs; /* List of configurations the module has registered */ + int configs_initialized; /* Have the module configurations been initialized? */ + int in_call; /* RM_Call() nesting level */ + int in_hook; /* Hooks callback nesting level for this module (0 or 1). */ + int options; /* Module options and capabilities. */ + int blocked_clients; /* Count of ValkeyModuleBlockedClient in this module. */ + ValkeyModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */ + ValkeyModuleDefragFunc defrag_cb; /* Callback for global data defrag. */ + struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */ + int num_commands_with_acl_categories; /* Number of commands in this module included in acl categories */ + int onload; /* Flag to identify if the call is being made from Onload (0 or 1) */ + size_t num_acl_categories_added; /* Number of acl categories added by this module. */ +} ValkeyModule; + +/* This is a wrapper for the 'rio' streams used inside rdb.c in the server, so that + * the user does not have to take the total count of the written bytes nor + * to care about error conditions. */ +typedef struct ValkeyModuleIO { + size_t bytes; /* Bytes read / written so far. */ + rio *rio; /* Rio stream. */ + moduleType *type; /* Module type doing the operation. */ + int error; /* True if error condition happened. */ + ValkeyModuleCtx *ctx; /* Optional context, see RM_GetContextFromIO()*/ + robj *key; /* Optional name of key processed */ + int dbid; /* The dbid of the key being processed, -1 when unknown. */ + sds pre_flush_buffer; /* A buffer that should be flushed before next write operation + * See rdbSaveSingleModuleAux for more details */ +} ValkeyModuleIO; + +/* Macro to initialize an IO context. Note that the 'ver' field is populated + * inside rdb.c according to the version of the value to load. */ +static inline void moduleInitIOContext(ValkeyModuleIO *iovar, + moduleType *mtype, + rio *rioptr, + robj *keyptr, + int db) { + iovar->rio = rioptr; + iovar->type = mtype; + iovar->bytes = 0; + iovar->error = 0; + iovar->key = keyptr; + iovar->dbid = db; + iovar->ctx = NULL; + iovar->pre_flush_buffer = NULL; +} + +/* This is a structure used to export DEBUG DIGEST capabilities to + * modules. 
We want to capture both the ordered and unordered elements of + * a data structure, so that a digest can be created in a way that correctly + * reflects the values. See the DEBUG DIGEST command implementation for more + * background. */ +typedef struct ValkeyModuleDigest { + unsigned char o[20]; /* Ordered elements. */ + unsigned char x[20]; /* Xored elements. */ + robj *key; /* Optional name of key processed */ + int dbid; /* The dbid of the key being processed */ +} ValkeyModuleDigest; + +/* Just start with a digest composed of all zero bytes. */ +static inline void moduleInitDigestContext(ValkeyModuleDigest *mdvar) { + memset(mdvar->o, 0, sizeof(mdvar->o)); + memset(mdvar->x, 0, sizeof(mdvar->x)); +} + +void moduleEnqueueLoadModule(sds path, sds *argv, int argc); +sds moduleLoadQueueEntryToLoadmoduleOptionStr(ValkeyModule *module, + const char *config_option_str); ValkeyModuleCtx *moduleAllocateContext(void); void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, client *client); void moduleFreeContext(ValkeyModuleCtx *ctx); +void moduleInitModulesSystem(void); +void moduleInitModulesSystemLast(void); +void modulesCron(void); +int moduleLoad(const char *path, void **argv, int argc, int is_loadex); +int moduleUnload(sds name, const char **errmsg); +void moduleLoadFromQueue(void); +int moduleGetCommandKeysViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +int moduleGetCommandChannelsViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +moduleType *moduleTypeLookupModuleByID(uint64_t id); +moduleType *moduleTypeLookupModuleByName(const char *name); +moduleType *moduleTypeLookupModuleByNameIgnoreCase(const char *name); +void moduleTypeNameByID(char *name, uint64_t moduleid); +const char *moduleTypeModuleName(moduleType *mt); +const char *moduleNameFromCommand(struct serverCommand *cmd); +void moduleFreeContext(ValkeyModuleCtx *ctx); +void moduleCallCommandUnblockedHandler(client *c); +int isModuleClientUnblocked(client *c); +void unblockClientFromModule(client *c); +void moduleHandleBlockedClients(void); +void moduleBlockedClientTimedOut(client *c, int from_module); +void modulePipeReadable(aeEventLoop *el, int fd, void *privdata, int mask); +size_t moduleCount(void); +void moduleAcquireGIL(void); +int moduleTryAcquireGIL(void); +void moduleReleaseGIL(void); +void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); +void firePostExecutionUnitJobs(void); +void moduleCallCommandFilters(client *c); +void modulePostExecutionUnitOperations(void); +void ModuleForkDoneHandler(int exitcode, int bysignal); +int TerminateModuleForkChild(int child_pid, int wait); +ssize_t rdbSaveModulesAux(rio *rdb, int when); +int moduleAllDatatypesHandleErrors(void); +int moduleAllModulesHandleReplAsyncLoad(void); +sds modulesCollectInfo(sds info, dict *sections_dict, int for_crash_report, int sections); +void moduleFireServerEvent(uint64_t eid, int subid, void *data); +void processModuleLoadingProgressEvent(int is_aof); +int moduleTryServeClientBlockedOnKey(client *c, robj *key); +void moduleUnblockClient(client *c); +int moduleBlockedClientMayTimeout(client *c); +int moduleClientIsBlockedOnKeys(client *c); +void moduleNotifyUserChanged(client *c); +void moduleNotifyKeyUnlink(robj *key, robj *val, int dbid, int flags); +size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); +size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); +robj *moduleTypeDupOrReply(client *c, robj 
*fromkey, robj *tokey, int todb, robj *value); +int moduleDefragValue(robj *key, robj *obj, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); +void moduleDefragGlobals(void); +void *moduleGetHandleByName(char *modulename); +int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); #endif /* _MODULE_H_ */ diff --git a/src/networking.c b/src/networking.c index 2190fca5bf..08e9a56313 100644 --- a/src/networking.c +++ b/src/networking.c @@ -35,6 +35,7 @@ #include "fpconv_dtoa.h" #include "fmtargs.h" #include "io_threads.h" +#include "module.h" #include #include #include diff --git a/src/notify.c b/src/notify.c index c655457e8b..d10d7dd9b9 100644 --- a/src/notify.c +++ b/src/notify.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "module.h" /* This file implements keyspace events notification via Pub/Sub and * described at https://valkey.io/topics/notifications */ diff --git a/src/object.c b/src/object.c index 15363f31b8..637b25e30c 100644 --- a/src/object.c +++ b/src/object.c @@ -34,6 +34,7 @@ #include "intset.h" /* Compact integer set structure */ #include "zmalloc.h" #include "sds.h" +#include "module.h" #include #include diff --git a/src/rdb.c b/src/rdb.c index a4eb2823fb..958eac5d4f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -37,6 +37,7 @@ #include "intset.h" /* Compact integer set structure */ #include "bio.h" #include "zmalloc.h" +#include "module.h" #include #include @@ -1098,7 +1099,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { * to call the right module during loading. */ int retval = rdbSaveLen(rdb, mt->id); if (retval == -1) return -1; - moduleInitIOContext(io, mt, rdb, key, dbid); + moduleInitIOContext(&io, mt, rdb, key, dbid); io.bytes += retval; /* Then write the module-specific representation + EOF marker. */ @@ -1242,7 +1243,7 @@ ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt) { /* Save a module-specific aux value. */ ValkeyModuleIO io; int retval = 0; - moduleInitIOContext(io, mt, rdb, NULL, -1); + moduleInitIOContext(&io, mt, rdb, NULL, -1); /* We save the AUX field header in a temporary buffer so we can support aux_save2 API. * If aux_save2 is used the buffer will be flushed at the first time the module will perform @@ -2795,7 +2796,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { ValkeyModuleIO io; robj keyobj; initStaticStringObject(keyobj, key); - moduleInitIOContext(io, mt, rdb, &keyobj, dbid); + moduleInitIOContext(&io, mt, rdb, &keyobj, dbid); /* Call the rdb_load method of the module providing the 10 bit * encoding version in the lower 10 bits of the module ID. */ void *ptr = mt->rdb_load(&io, moduleid & 1023); @@ -3221,7 +3222,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin } ValkeyModuleIO io; - moduleInitIOContext(io, mt, rdb, NULL, -1); + moduleInitIOContext(&io, mt, rdb, NULL, -1); /* Call the rdb_load method of the module providing the 10 bit * encoding version in the lower 10 bits of the module ID. 
*/ int rc = mt->aux_load(&io, moduleid & 1023, when); diff --git a/src/replication.c b/src/replication.c index bec52a84d0..c5611d5a5a 100644 --- a/src/replication.c +++ b/src/replication.c @@ -35,6 +35,7 @@ #include "bio.h" #include "functions.h" #include "connection.h" +#include "module.h" #include #include diff --git a/src/script.c b/src/script.c index f1d0a8fb79..f342d496fc 100644 --- a/src/script.c +++ b/src/script.c @@ -31,6 +31,7 @@ #include "script.h" #include "cluster.h" #include "cluster_slot_stats.h" +#include "module.h" scriptFlag scripts_flags_def[] = { {.flag = SCRIPT_FLAG_NO_WRITES, .str = "no-writes"}, diff --git a/src/server.c b/src/server.c index b997a9aec6..e53e7ff552 100644 --- a/src/server.c +++ b/src/server.c @@ -42,6 +42,7 @@ #include "fmtargs.h" #include "io_threads.h" #include "sds.h" +#include "module.h" #include #include diff --git a/src/server.h b/src/server.h index 582392bca0..b65488aab8 100644 --- a/src/server.h +++ b/src/server.h @@ -701,168 +701,7 @@ typedef enum { #define OBJ_STREAM 6 /* Stream object. */ #define OBJ_TYPE_MAX 7 /* Maximum number of object types */ -/* Extract encver / signature from a module type ID. */ -#define VALKEYMODULE_TYPE_ENCVER_BITS 10 -#define VALKEYMODULE_TYPE_ENCVER_MASK ((1 << VALKEYMODULE_TYPE_ENCVER_BITS) - 1) -#define VALKEYMODULE_TYPE_ENCVER(id) ((id) & VALKEYMODULE_TYPE_ENCVER_MASK) -#define VALKEYMODULE_TYPE_SIGN(id) \ - (((id) & ~((uint64_t)VALKEYMODULE_TYPE_ENCVER_MASK)) >> VALKEYMODULE_TYPE_ENCVER_BITS) - -/* Bit flags for moduleTypeAuxSaveFunc */ -#define VALKEYMODULE_AUX_BEFORE_RDB (1 << 0) -#define VALKEYMODULE_AUX_AFTER_RDB (1 << 1) - -struct ValkeyModule; -struct ValkeyModuleIO; -struct ValkeyModuleDigest; -struct ValkeyModuleCtx; -struct moduleLoadQueueEntry; -struct ValkeyModuleKeyOptCtx; -struct ValkeyModuleCommand; -struct clusterState; - -/* Each module type implementation should export a set of methods in order - * to serialize and deserialize the value in the RDB file, rewrite the AOF - * log, create the digest for "DEBUG DIGEST", and free the value when a key - * is deleted. 
*/ -typedef void *(*moduleTypeLoadFunc)(struct ValkeyModuleIO *io, int encver); -typedef void (*moduleTypeSaveFunc)(struct ValkeyModuleIO *io, void *value); -typedef int (*moduleTypeAuxLoadFunc)(struct ValkeyModuleIO *rdb, int encver, int when); -typedef void (*moduleTypeAuxSaveFunc)(struct ValkeyModuleIO *rdb, int when); -typedef void (*moduleTypeRewriteFunc)(struct ValkeyModuleIO *io, struct serverObject *key, void *value); -typedef void (*moduleTypeDigestFunc)(struct ValkeyModuleDigest *digest, void *value); -typedef size_t (*moduleTypeMemUsageFunc)(const void *value); -typedef void (*moduleTypeFreeFunc)(void *value); -typedef size_t (*moduleTypeFreeEffortFunc)(struct serverObject *key, const void *value); -typedef void (*moduleTypeUnlinkFunc)(struct serverObject *key, void *value); -typedef void *(*moduleTypeCopyFunc)(struct serverObject *fromkey, struct serverObject *tokey, const void *value); -typedef int (*moduleTypeDefragFunc)(struct ValkeyModuleDefragCtx *ctx, struct serverObject *key, void **value); -typedef size_t (*moduleTypeMemUsageFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value, size_t sample_size); -typedef void (*moduleTypeFreeFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); -typedef size_t (*moduleTypeFreeEffortFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); -typedef void (*moduleTypeUnlinkFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); -typedef void *(*moduleTypeCopyFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); -typedef int (*moduleTypeAuthCallback)(struct ValkeyModuleCtx *ctx, void *username, void *password, const char **err); - - -/* The module type, which is referenced in each value of a given type, defines - * the methods and links to the module exporting the type. */ -typedef struct ValkeyModuleType { - uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. */ - struct ValkeyModule *module; - moduleTypeLoadFunc rdb_load; - moduleTypeSaveFunc rdb_save; - moduleTypeRewriteFunc aof_rewrite; - moduleTypeMemUsageFunc mem_usage; - moduleTypeDigestFunc digest; - moduleTypeFreeFunc free; - moduleTypeFreeEffortFunc free_effort; - moduleTypeUnlinkFunc unlink; - moduleTypeCopyFunc copy; - moduleTypeDefragFunc defrag; - moduleTypeAuxLoadFunc aux_load; - moduleTypeAuxSaveFunc aux_save; - moduleTypeMemUsageFunc2 mem_usage2; - moduleTypeFreeEffortFunc2 free_effort2; - moduleTypeUnlinkFunc2 unlink2; - moduleTypeCopyFunc2 copy2; - moduleTypeAuxSaveFunc aux_save2; - int aux_save_triggers; - char name[10]; /* 9 bytes name + null term. Charset: A-Z a-z 0-9 _- */ -} moduleType; - -/* In Object 'robj' structures of type OBJ_MODULE, the value pointer - * is set to the following structure, referencing the moduleType structure - * in order to work with the value, and at the same time providing a raw - * pointer to the value, as created by the module commands operating with - * the module type. - * - * So for example in order to free such a value, it is possible to use - * the following code: - * - * if (robj->type == OBJ_MODULE) { - * moduleValue *mt = robj->ptr; - * mt->type->free(mt->value); - * zfree(mt); // We need to release this in-the-middle struct as well. - * } - */ -typedef struct moduleValue { - moduleType *type; - void *value; -} moduleValue; - -/* This structure represents a module inside the system. */ -struct ValkeyModule { - void *handle; /* Module dlopen() handle. */ - char *name; /* Module name. */ - int ver; /* Module version. We use just progressive integers. 
*/ - int apiver; /* Module API version as requested during initialization.*/ - list *types; /* Module data types. */ - list *usedby; /* List of modules using APIs from this one. */ - list *using; /* List of modules we use some APIs of. */ - list *filters; /* List of filters the module has registered. */ - list *module_configs; /* List of configurations the module has registered */ - int configs_initialized; /* Have the module configurations been initialized? */ - int in_call; /* RM_Call() nesting level */ - int in_hook; /* Hooks callback nesting level for this module (0 or 1). */ - int options; /* Module options and capabilities. */ - int blocked_clients; /* Count of ValkeyModuleBlockedClient in this module. */ - ValkeyModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */ - ValkeyModuleDefragFunc defrag_cb; /* Callback for global data defrag. */ - struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */ - int num_commands_with_acl_categories; /* Number of commands in this module included in acl categories */ - int onload; /* Flag to identify if the call is being made from Onload (0 or 1) */ - size_t num_acl_categories_added; /* Number of acl categories added by this module. */ -}; -typedef struct ValkeyModule ValkeyModule; - -/* This is a wrapper for the 'rio' streams used inside rdb.c in the server, so that - * the user does not have to take the total count of the written bytes nor - * to care about error conditions. */ -struct ValkeyModuleIO { - size_t bytes; /* Bytes read / written so far. */ - rio *rio; /* Rio stream. */ - moduleType *type; /* Module type doing the operation. */ - int error; /* True if error condition happened. */ - struct ValkeyModuleCtx *ctx; /* Optional context, see RM_GetContextFromIO()*/ - struct serverObject *key; /* Optional name of key processed */ - int dbid; /* The dbid of the key being processed, -1 when unknown. */ - sds pre_flush_buffer; /* A buffer that should be flushed before next write operation - * See rdbSaveSingleModuleAux for more details */ -}; - -/* Macro to initialize an IO context. Note that the 'ver' field is populated - * inside rdb.c according to the version of the value to load. */ -#define moduleInitIOContext(iovar, mtype, rioptr, keyptr, db) \ - do { \ - iovar.rio = rioptr; \ - iovar.type = mtype; \ - iovar.bytes = 0; \ - iovar.error = 0; \ - iovar.key = keyptr; \ - iovar.dbid = db; \ - iovar.ctx = NULL; \ - iovar.pre_flush_buffer = NULL; \ - } while (0) - -/* This is a structure used to export DEBUG DIGEST capabilities to - * modules. We want to capture both the ordered and unordered elements of - * a data structure, so that a digest can be created in a way that correctly - * reflects the values. See the DEBUG DIGEST command implementation for more - * background. */ -struct ValkeyModuleDigest { - unsigned char o[20]; /* Ordered elements. */ - unsigned char x[20]; /* Xored elements. */ - struct serverObject *key; /* Optional name of key processed */ - int dbid; /* The dbid of the key being processed */ -}; - -/* Just start with a digest composed of all zero bytes. */ -#define moduleInitDigestContext(mdvar) \ - do { \ - memset(mdvar.o, 0, sizeof(mdvar.o)); \ - memset(mdvar.x, 0, sizeof(mdvar.x)); \ - } while (0) +typedef struct ValkeyModuleType moduleType; /* Macro to check if the client is in the middle of module based authentication. 
*/ #define clientHasModuleAuthInProgress(c) ((c)->module_auth_ctx != NULL) @@ -1418,12 +1257,6 @@ struct saveparam { int changes; }; -struct moduleLoadQueueEntry { - sds path; - int argc; - robj **argv; -}; - struct sentinelLoadQueueEntry { int argc; sds *argv; @@ -2717,59 +2550,6 @@ extern dict *modules; /* Command metadata */ void populateCommandLegacyRangeSpec(struct serverCommand *c); -/* Modules */ -void moduleInitModulesSystem(void); -void moduleInitModulesSystemLast(void); -void modulesCron(void); -int moduleLoad(const char *path, void **argv, int argc, int is_loadex); -int moduleUnload(sds name, const char **errmsg); -void moduleLoadFromQueue(void); -int moduleGetCommandKeysViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); -int moduleGetCommandChannelsViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); -moduleType *moduleTypeLookupModuleByID(uint64_t id); -moduleType *moduleTypeLookupModuleByName(const char *name); -moduleType *moduleTypeLookupModuleByNameIgnoreCase(const char *name); -void moduleTypeNameByID(char *name, uint64_t moduleid); -const char *moduleTypeModuleName(moduleType *mt); -const char *moduleNameFromCommand(struct serverCommand *cmd); -void moduleFreeContext(struct ValkeyModuleCtx *ctx); -void moduleCallCommandUnblockedHandler(client *c); -int isModuleClientUnblocked(client *c); -void unblockClientFromModule(client *c); -void moduleHandleBlockedClients(void); -void moduleBlockedClientTimedOut(client *c, int from_module); -void modulePipeReadable(aeEventLoop *el, int fd, void *privdata, int mask); -size_t moduleCount(void); -void moduleAcquireGIL(void); -int moduleTryAcquireGIL(void); -void moduleReleaseGIL(void); -void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); -void firePostExecutionUnitJobs(void); -void moduleCallCommandFilters(client *c); -void modulePostExecutionUnitOperations(void); -void ModuleForkDoneHandler(int exitcode, int bysignal); -int TerminateModuleForkChild(int child_pid, int wait); -ssize_t rdbSaveModulesAux(rio *rdb, int when); -int moduleAllDatatypesHandleErrors(void); -int moduleAllModulesHandleReplAsyncLoad(void); -sds modulesCollectInfo(sds info, dict *sections_dict, int for_crash_report, int sections); -void moduleFireServerEvent(uint64_t eid, int subid, void *data); -void processModuleLoadingProgressEvent(int is_aof); -int moduleTryServeClientBlockedOnKey(client *c, robj *key); -void moduleUnblockClient(client *c); -int moduleBlockedClientMayTimeout(client *c); -int moduleClientIsBlockedOnKeys(client *c); -void moduleNotifyUserChanged(client *c); -void moduleNotifyKeyUnlink(robj *key, robj *val, int dbid, int flags); -size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); -size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); -robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value); -int moduleDefragValue(robj *key, robj *obj, int dbid); -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); -void moduleDefragGlobals(void); -void *moduleGetHandleByName(char *modulename); -int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); - /* Utils */ long long ustime(void); mstime_t mstime(void); diff --git a/src/valkey-check-rdb.c b/src/valkey-check-rdb.c index ba94c172c7..6f23c21fb8 100644 --- a/src/valkey-check-rdb.c +++ b/src/valkey-check-rdb.c @@ -30,6 +30,7 @@ #include "mt19937-64.h" #include "server.h" #include "rdb.h" +#include 
"module.h" #include #include From 4dcb2b706edf91be13f5c85530f11cb1e66b68ea Mon Sep 17 00:00:00 2001 From: Wen Hui Date: Thu, 2 Jan 2025 20:37:55 -0500 Subject: [PATCH 049/101] Update Redis legacy keyword and link in utils/whatisdoing.sh (#1495) Signed-off-by: hwware --- utils/whatisdoing.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/whatisdoing.sh b/utils/whatisdoing.sh index 68d7f7cca6..00117f4863 100755 --- a/utils/whatisdoing.sh +++ b/utils/whatisdoing.sh @@ -1,15 +1,15 @@ # This script is from http://poormansprofiler.org/ # -# NOTE: Instead of using this script, you should use the Redis +# NOTE: Instead of using this script, you should use the Valkey # Software Watchdog, which provides a similar functionality but in # a more reliable / easy to use way. # -# Check https://redis.io/topics/latency for more information. +# Check https://valkey.io/topics/latency for more information. #!/bin/bash nsamples=1 sleeptime=0 -pid=$(ps auxww | grep '[r]edis-server' | awk '{print $2}') +pid=$(ps auxww | grep '[v]alkey-server' | awk '{print $2}') for x in $(seq 1 $nsamples) do From 890bb71114358ec5473f5d78b5cd4a6913488126 Mon Sep 17 00:00:00 2001 From: gmbnomis Date: Fri, 3 Jan 2025 02:41:15 +0100 Subject: [PATCH 050/101] Use the correct command proc for the LOOKUP_NOTOUCH exception in lookupKey (#1499) When looking up a key in no-touch mode, `LOOKUP_NOTOUCH` is set to avoid updating the last access time in `lookupKey`. An exception must be made for the `TOUCH` command which must always update the key. When called from a script, `server.executing_client` will point to the `TOUCH` command, while `server.current_client` will point to e.g. an `EVAL` command. So, we must use the former to find out the currently executing command if defined. This fix addresses the issue where TOUCH wasn't updating key access times when called from scripts like EVAL. Fixes #1498 Signed-off-by: Simon Baatz Co-authored-by: Binbin --- src/db.c | 2 +- tests/unit/introspection-2.tcl | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/db.c b/src/db.c index 1362b5f9dd..9a53e6b4d1 100644 --- a/src/db.c +++ b/src/db.c @@ -125,7 +125,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * Don't do it if we have a saving child, as this will trigger * a copy on write madness. */ if (server.current_client && server.current_client->flag.no_touch && - server.current_client->cmd->proc != touchCommand) + server.executing_client->cmd->proc != touchCommand) flags |= LOOKUP_NOTOUCH; if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { /* Shared objects can't be stored in the database. 
*/ diff --git a/tests/unit/introspection-2.tcl b/tests/unit/introspection-2.tcl index b8f4e0aed4..301c86937b 100644 --- a/tests/unit/introspection-2.tcl +++ b/tests/unit/introspection-2.tcl @@ -30,11 +30,24 @@ start_server {tags {"introspection"}} { assert {[r object idletime foo] >= 2} } - test {TOUCH alters the last access time of a key} { + proc test_touch_alters_access_time {} { r set foo bar + r set script_foo bar after 3000 r touch foo + r eval {redis.call('touch', KEYS[1])} 1 script_foo assert {[r object idletime foo] < 2} + assert {[r object idletime script_foo] < 2} + } + + test {TOUCH alters the last access time of a key} { + test_touch_alters_access_time + } + + test {TOUCH alters the last access time of a key in no-touch mode} { + r client no-touch on + test_touch_alters_access_time + r client no-touch off } test {Operations in no-touch mode do not alter the last access time of a key} { From 888ea5aeeb5c572a9118040a506f00e5a259ace9 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Thu, 2 Jan 2025 17:43:16 -0800 Subject: [PATCH 051/101] Move coverity back to ubuntu 22 until test failures are fixed (#1504) The issues in #1453 seem to have only shown up since we moved to ubuntu 24, as part of the rolling `ubuntu-latest` migration from 22->24. Closes #1453. Signed-off-by: Madelyn Olson --- .github/workflows/coverity.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 2561e4ceb5..acf5686c13 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -17,7 +17,7 @@ permissions: jobs: coverity: if: github.repository == 'valkey-io/valkey' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Download and extract the Coverity Build Tool From 3b085074ee894ce253d0e8415c9b53e8549dff43 Mon Sep 17 00:00:00 2001 From: eifrah-aws Date: Fri, 3 Jan 2025 03:44:41 +0200 Subject: [PATCH 052/101] CMake: fail on warnings (#1503) When building with `CMake` (especially the targets `valkey-cli`, `valkey-server` and `valkey-benchmark`) it is possible to have a successful build while having warnings.
This PR fixes this - which is aligned with how the `Makefile` is working today: - Enable `-Wall` + `-Werror` for valkey targets - Fixed warning in valkey-cli:jsonStringOutput method Signed-off-by: Eran Ifrah --- cmake/Modules/ValkeySetup.cmake | 3 +++ src/valkey-cli.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index 8a4d4da1c9..77360844fc 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -93,6 +93,9 @@ macro (valkey_build_and_install_bin target sources ld_flags libs link_name) target_link_libraries(${target} execinfo) endif () + # Enable all warnings + fail on warning + target_compile_options(${target} PRIVATE -Werror -Wall) + # Install cli tool and create a redis symbolic link valkey_install_bin(${target}) valkey_create_symlink(${target} ${link_name}) diff --git a/src/valkey-cli.c b/src/valkey-cli.c index 4416e09431..0a4f1affa2 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -2040,6 +2040,8 @@ static sds jsonStringOutput(sds out, const char *p, int len, int mode) { } else { assert(0); } + /* Silence compiler warning */ + return NULL; } static sds cliFormatReplyJson(sds out, redisReply *r, int mode) { From b95ad54c76cce569b8da3803a256f44e47a3cb88 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 4 Jan 2025 10:47:32 +0800 Subject: [PATCH 053/101] Explicitly check C_ERR condition to improve readability in clusterSaveConfig (#1505) It's not obvious to see at first glance, so modify it to check C_ERR explicitly. Signed-off-by: Binbin --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a1b1d0e986..807488b57d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -868,7 +868,7 @@ int clusterSaveConfig(int do_fsync) { cleanup: if (fd != -1) close(fd); - if (retval) unlink(tmpfilename); + if (retval == C_ERR) unlink(tmpfilename); sdsfree(tmpfilename); sdsfree(ci); return retval; From 3072443b4ea05cb3e522654f898d144849b9a657 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 6 Jan 2025 09:26:17 +0800 Subject: [PATCH 054/101] Check whether to switch to fail when setting the node to pfail in cron (#1061) This may speed up the transition to the fail state a bit. Previously we would only check when we received a pfail/fail report from others in gossip. If myself is the last vote, we can directly switch to fail here without waiting for the next gossip packet. Signed-off-by: Binbin --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 807488b57d..b59c30126a 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5217,7 +5217,7 @@ void clusterCron(void) { if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) { node->flags |= CLUSTER_NODE_PFAIL; update_state = 1; - if (server.cluster->size == 1 && clusterNodeIsVotingPrimary(myself)) { + if (clusterNodeIsVotingPrimary(myself)) { markNodeAsFailingIfNeeded(node); } else { serverLog(LL_NOTICE, "NODE %.40s (%s) possibly failing.", node->name, node->human_nodename); From bbd22eacfe333caba305e9d755def340461114df Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 6 Jan 2025 14:02:16 -0800 Subject: [PATCH 055/101] Fix Read/Write key pattern selector (CVE-2024-51741) (#1514) The explanation on the original commit was wrong. Key-based access must have a `~` in order to correctly configure which key prefixes the selector applies to (e.g. `%R~read*`, as opposed to a bare `%RW`).
If this is missing, a server assert will be triggered later. Signed-off-by: Madelyn Olson Co-authored-by: YaacovHazan --- src/acl.c | 11 ++++++++--- tests/unit/acl-v2.tcl | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/acl.c b/src/acl.c index 725419dcf2..0928c43914 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1078,19 +1078,24 @@ int ACLSetSelector(aclSelector *selector, const char *op, size_t oplen) { int flags = 0; size_t offset = 1; if (op[0] == '%') { + int perm_ok = 1; for (; offset < oplen; offset++) { if (toupper(op[offset]) == 'R' && !(flags & ACL_READ_PERMISSION)) { flags |= ACL_READ_PERMISSION; } else if (toupper(op[offset]) == 'W' && !(flags & ACL_WRITE_PERMISSION)) { flags |= ACL_WRITE_PERMISSION; - } else if (op[offset] == '~' && flags) { + } else if (op[offset] == '~') { offset++; break; } else { - errno = EINVAL; - return C_ERR; + perm_ok = 0; + break; } } + if (!flags || !perm_ok) { + errno = EINVAL; + return C_ERR; + } } else { flags = ACL_ALL_PERMISSION; } diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index e8229d1b36..b33b53eabc 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -116,11 +116,32 @@ start_server {tags {"acl external:skip"}} { assert_match "*NOPERM*key*" $err } - test {Validate read and write permissions format} { + test {Validate read and write permissions format - empty permission} { catch {r ACL SETUSER key-permission-RW %~} err set err } {ERR Error in ACL SETUSER modifier '%~': Syntax error} + test {Validate read and write permissions format - empty selector} { + catch {r ACL SETUSER key-permission-RW %} err + set err + } {ERR Error in ACL SETUSER modifier '%': Syntax error} + + test {Validate read and write permissions format - empty pattern} { + # Empty pattern results with R/W access to no key + r ACL SETUSER key-permission-RW on nopass %RW~ +@all + $r2 auth key-permission-RW password + catch {$r2 SET x 5} err + set err + } {NOPERM No permissions to access a key} + + test {Validate read and write permissions format - no pattern} { + # No pattern results with R/W access to no key (currently we accept this syntax error) + r ACL SETUSER key-permission-RW on nopass %RW +@all + $r2 auth key-permission-RW password + catch {$r2 SET x 5} err + set err + } {NOPERM No permissions to access a key} + test {Test separate read and write permissions on different selectors are not additive} { r ACL SETUSER key-permission-RW-selector on nopass "(%R~read* +@all)" "(%W~write* +@all)" $r2 auth key-permission-RW-selector password From 38910e2ec17c3c3b38dd7d946dd6e6bcbc5cde94 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 6 Jan 2025 14:02:22 -0800 Subject: [PATCH 056/101] Fix LUA garbage collector (CVE-2024-46981) (#1513) Reset GC state before closing the Lua VM to prevent user data from being wrongly freed while it might still be used in destructor callbacks. Created and published by Redis in their OSS branch.
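The ordering this patch enforces, collect first and close second, can be shown with a minimal self-contained Lua C API program (plain stock Lua assumed here; the server embeds Lua with extra teardown around lua_close, which is why finalizers must not be left to run during it):

#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>

int main(void) {
    lua_State *L = luaL_newstate();
    luaL_openlibs(L);
    /* ... run scripts, create userdata with __gc metamethods ... */

    /* Force a full collection while everything the __gc callbacks may
     * reference is still valid, then tear the VM down. */
    lua_gc(L, LUA_GCCOLLECT, 0);
    lua_close(L);
    return 0;
}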
Signed-off-by: Madelyn Olson Co-authored-by: YaacovHazan --- src/eval.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eval.c b/src/eval.c index e9fac531f5..9aa185d77b 100644 --- a/src/eval.c +++ b/src/eval.c @@ -285,6 +285,7 @@ void scriptingInit(int setup) { void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { dictRelease(lua_scripts); listRelease(lua_scripts_lru_list); + lua_gc(lctx.lua, LUA_GCCOLLECT, 0); lua_close(lua); #if !defined(USE_LIBC) From 990782e3f5ba85eb4aabb1d49ea8a757bd909b09 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 6 Jan 2025 15:46:55 -0800 Subject: [PATCH 057/101] Add tests for acl selectors with no permissions or patterns (#1515) Signed-off-by: Madelyn Olson --- tests/unit/acl-v2.tcl | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index b33b53eabc..bcaab9e817 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -116,31 +116,26 @@ start_server {tags {"acl external:skip"}} { assert_match "*NOPERM*key*" $err } - test {Validate read and write permissions format - empty permission} { - catch {r ACL SETUSER key-permission-RW %~} err - set err - } {ERR Error in ACL SETUSER modifier '%~': Syntax error} - - test {Validate read and write permissions format - empty selector} { - catch {r ACL SETUSER key-permission-RW %} err - set err - } {ERR Error in ACL SETUSER modifier '%': Syntax error} - - test {Validate read and write permissions format - empty pattern} { - # Empty pattern results with R/W access to no key - r ACL SETUSER key-permission-RW on nopass %RW~ +@all - $r2 auth key-permission-RW password - catch {$r2 SET x 5} err - set err - } {NOPERM No permissions to access a key} + test {Validate read and write permissions format} { + # Regression tests for CVE-2024-51741 + assert_error "ERR Error in ACL SETUSER modifier '%~': Syntax error" {r ACL SETUSER invalid %~} + assert_error "ERR Error in ACL SETUSER modifier '%': Syntax error" {r ACL SETUSER invalid %} + } - test {Validate read and write permissions format - no pattern} { - # No pattern results with R/W access to no key (currently we accept this syntax error) - r ACL SETUSER key-permission-RW on nopass %RW +@all - $r2 auth key-permission-RW password - catch {$r2 SET x 5} err - set err - } {NOPERM No permissions to access a key} + test {Validate key permissions format - empty and omitted pattern} { + # Empty pattern results with access to only the empty key + r ACL SETUSER key-permission-no-key on nopass %RW~ +@all + assert_equal "User key-permission-no-key has no permissions to access the 'x' key" [r ACL DRYRUN key-permission-no-key GET x] + assert_equal "OK" [r ACL DRYRUN key-permission-no-key GET ""] + + # This is incorrect syntax, it should have `~`, but we'll allow it for compatibility since it does something + r ACL SETUSER key-permission-omit on nopass %RW +@all + assert_equal "User key-permission-omit has no permissions to access the 'x' key" [r ACL DRYRUN key-permission-omit GET x] + assert_equal "OK" [r ACL DRYRUN key-permission-omit GET ""] + + # Assert these two are equivalent + assert_equal [r ACL GETUSER key-permission-omit] [r ACL GETUSER key-permission-no-key] + } test {Test separate read and write permissions on different selectors are not additive} { r ACL SETUSER key-permission-RW-selector on nopass "(%R~read* +@all)" "(%W~write* +@all)" From 794567fe656d7a8e43d279669e3d3f196d1d1239 Mon Sep 17 00:00:00 2001 From: Rueian Date: 
Tue, 7 Jan 2025 13:54:55 -0800 Subject: [PATCH 058/101] Add `availability_zone` to the HELLO response (#1487) It's inconvenient for client implementations to extract the `availability_zone` information from the `INFO` response. The `INFO` response contains a lot of information that a client implementation typically doesn't need. This PR adds the availability zone to the `HELLO` response. Clients usually already use the `HELLO` command for protocol negotiation and also get the server `version` and `role` from its response. To keep the `HELLO` response small, the field is only added if availability zone is configured. --------- Signed-off-by: Rueian --- src/networking.c | 7 ++++++- tests/unit/protocol.tcl | 34 ++++++++++++++++++++++++++++++++++ tests/unit/tracking.tcl | 17 ----------------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/networking.c b/src/networking.c index 08e9a56313..86f87deb8b 100644 --- a/src/networking.c +++ b/src/networking.c @@ -4206,7 +4206,7 @@ void helloCommand(client *c) { /* Let's switch to the specified RESP mode. */ if (ver) c->resp = ver; - addReplyMapLen(c, 6 + !server.sentinel_mode); + addReplyMapLen(c, 6 + !server.sentinel_mode + (sdslen(server.availability_zone) != 0)); addReplyBulkCString(c, "server"); addReplyBulkCString(c, server.extended_redis_compat ? "redis" : SERVER_NAME); @@ -4235,6 +4235,11 @@ void helloCommand(client *c) { addReplyBulkCString(c, "modules"); addReplyLoadedModules(c); + + if (sdslen(server.availability_zone) != 0) { + addReplyBulkCString(c, "availability_zone"); + addReplyBulkCBuffer(c, server.availability_zone, sdslen(server.availability_zone)); + } } /* This callback is bound to POST and "Host:" command names. Those are not diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index f3a2b8e1a8..f0e64368cc 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -232,6 +232,40 @@ start_server {tags {"protocol network"}} { } +start_server {tags {"protocol hello"}} { + test {HELLO without protover} { + set reply [r HELLO 3] + assert_equal [dict get $reply proto] 3 + + set reply [r HELLO] + assert_equal [dict get $reply proto] 3 + + set reply [r HELLO 2] + assert_equal [dict get $reply proto] 2 + + set reply [r HELLO] + assert_equal [dict get $reply proto] 2 + } + + test {HELLO and availability-zone} { + r CONFIG SET availability-zone myzone + + set reply [r HELLO 3] + assert_equal [dict get $reply availability_zone] myzone + + set reply [r HELLO 2] + assert_equal [dict get $reply availability_zone] myzone + + r CONFIG SET availability-zone "" + + set reply [r HELLO 3] + assert_equal [dict exists $reply availability_zone] 0 + + set reply [r HELLO 2] + assert_equal [dict exists $reply availability_zone] 0 + } +} + start_server {tags {"regression"}} { test "Regression for a crash with blocking ops and pipelining" { set rd [valkey_deferring_client] diff --git a/tests/unit/tracking.tcl b/tests/unit/tracking.tcl index 313293dcb7..9fdc4b79cd 100644 --- a/tests/unit/tracking.tcl +++ b/tests/unit/tracking.tcl @@ -154,23 +154,6 @@ start_server {tags {"tracking network logreqres:skip"}} { assert_equal [dict get $reply proto] 3 } - test {HELLO without protover} { - set reply [r HELLO 3] - assert_equal [dict get $reply proto] 3 - - set reply [r HELLO] - assert_equal [dict get $reply proto] 3 - - set reply [r HELLO 2] - assert_equal [dict get $reply proto] 2 - - set reply [r HELLO] - assert_equal [dict get $reply proto] 2 - - # restore RESP3 for next test - r HELLO 3 - } - test {RESP3 based basic 
invalidation} { r CLIENT TRACKING off r CLIENT TRACKING on From 50487cca7956f495d601c132ec901e65456a2833 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Tue, 7 Jan 2025 15:43:46 -0800 Subject: [PATCH 059/101] Actually run code coverage on ubuntu 22 (#1522) This commit, https://github.com/valkey-io/valkey/pull/1504, moved the wrong worker to ubuntu 22. We wanted to move codecov and not coverity. Signed-off-by: Madelyn Olson --- .github/workflows/codecov.yml | 2 +- .github/workflows/coverity.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 951b5c2862..cd1f1b20a7 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -10,7 +10,7 @@ concurrency: jobs: code-coverage: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index acf5686c13..2561e4ceb5 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -17,7 +17,7 @@ permissions: jobs: coverity: if: github.repository == 'valkey-io/valkey' - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Download and extract the Coverity Build Tool From 1987c3647d6a55915a833fe8c0adf76bdaa1cdda Mon Sep 17 00:00:00 2001 From: Rueian Date: Tue, 7 Jan 2025 18:04:58 -0800 Subject: [PATCH 060/101] Add `availability_zone` to the HELLO command history (#1524) This PR is a followup for #1487. Signed-off-by: Rueian Co-authored-by: Binbin --- src/commands.def | 3 ++- src/commands/hello.json | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/commands.def b/src/commands.def index f03e44db9f..c5d766e3f8 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1719,6 +1719,7 @@ struct COMMAND_ARG ECHO_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* HELLO history */ commandHistory HELLO_History[] = { +{"8.1.0","A new `availability_zone` field is added to the response if the `availability-zone` config is set."}, {"6.2.0","`protover` made optional; when called without arguments the command reports the current connection's context."}, }; #endif @@ -10911,7 +10912,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("auth","Authenticates the connection.","O(N) where N is the number of passwords defined for the user","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,AUTH_History,1,AUTH_Tips,0,authCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,AUTH_Keyspecs,0,NULL,2),.args=AUTH_Args}, {MAKE_CMD("client","A container for client connection commands.","Depends on subcommand.","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_History,0,CLIENT_Tips,0,NULL,-2,CMD_SENTINEL,0,CLIENT_Keyspecs,0,NULL,0),.subcommands=CLIENT_Subcommands}, {MAKE_CMD("echo","Returns the given string.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,ECHO_History,0,ECHO_Tips,0,echoCommand,2,CMD_LOADING|CMD_STALE|CMD_FAST,ACL_CATEGORY_CONNECTION,ECHO_Keyspecs,0,NULL,1),.args=ECHO_Args}, -{MAKE_CMD("hello","Handshakes with the server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,1,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, 
+{MAKE_CMD("hello","Handshakes with the server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,2,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, {MAKE_CMD("ping","Returns the server's liveliness response.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,PING_History,0,PING_Tips,2,pingCommand,-1,CMD_FAST|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,PING_Keyspecs,0,NULL,1),.args=PING_Args}, {MAKE_CMD("quit","Closes the connection.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"just closing the connection","7.2.0","connection",COMMAND_GROUP_CONNECTION,QUIT_History,0,QUIT_Tips,0,quitCommand,-1,CMD_ALLOW_BUSY|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH,ACL_CATEGORY_CONNECTION,QUIT_Keyspecs,0,NULL,0)}, {MAKE_CMD("reset","Resets the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,RESET_History,0,RESET_Tips,0,resetCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,RESET_Keyspecs,0,NULL,0)}, diff --git a/src/commands/hello.json b/src/commands/hello.json index f3fcc5a13c..15fd81c655 100644 --- a/src/commands/hello.json +++ b/src/commands/hello.json @@ -7,6 +7,10 @@ "arity": -1, "function": "helloCommand", "history": [ + [ + "8.1.0", + "A new `availability_zone` field is added to the response if the `availability-zone` config is set." + ], [ "6.2.0", "`protover` made optional; when called without arguments the command reports the current connection's context." From ac096a19efcfa038914fe3d6e2ee1dc542a8af14 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:28:54 +0200 Subject: [PATCH 061/101] client struct: lazy init components and optimize struct layout (#1405) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Refactor client structure to use modular data components ## Current State The client structure allocates memory for replication / pubsub / multi-keys / module / blocked data for every client, despite these features being used by only a small subset of clients. In addition the current field layout in the client struct is suboptimal, with poor alignment and unnecessary padding between fields, leading to a larger than necessary memory footprint of 896 bytes per client. Furthermore, fields that are frequently accessed together during operations are scattered throughout the struct, resulting in poor cache locality. ## This PR's Change 1. Lazy Initialization - **Components are only allocated when first used:** - PubSubData: Created on first SUBSCRIBE/PUBLISH operation - ReplicationData: Initialized only for replica connections - ModuleData: Allocated when module interaction begins - BlockingState: Created when first blocking command is issued - MultiState: Initialized on MULTI command 2. Memory Layout Optimization: - Grouped related fields for better locality - Moved rarely accessed fields (e.g., client->name) to struct end - Optimized field alignment to eliminate padding 3. Additional changes: - Moved watched_keys to be static allocated in the `mstate` struct - Relocated replication init logic to replication.c ### Key Benefits - **Efficient Memory Usage:** - 45% smaller base client structure - Basic clients now use 528 bytes (down from 896). 
- Better memory locality for related operations - Performance improvement in high throughput scenarios. No performance regressions in other cases. ### Performance Impact Tested with 650 clients and 512 bytes values. #### Single Thread Performance | Operation | Dataset | New (ops/sec) | Old (ops/sec) | Change % | |------------|---------|---------------|---------------|-----------| | SET | 1 key | 261,799 | 258,261 | +1.37% | | SET | 3M keys | 209,134 | ~209,000 | ~0% | | GET | 1 key | 281,564 | 277,965 | +1.29% | | GET | 3M keys | 231,158 | 228,410 | +1.20% | #### 8 IO Threads Performance | Operation | Dataset | New (ops/sec) | Old (ops/sec) | Change % | |------------|---------|---------------|---------------|-----------| | SET | 1 key | 1,331,578 | 1,331,626 | -0.00% | | SET | 3M keys | 1,254,441 | 1,152,645 | +8.83% | | GET | 1 key | 1,293,149 | 1,289,503 | +0.28% | | GET | 3M keys | 1,152,898 | 1,101,791 | +4.64% | #### Pipeline Performance (3M keys) | Operation | Pipeline Size | New (ops/sec) | Old (ops/sec) | Change % | |-----------|--------------|---------------|---------------|-----------| | SET | 10 | 548,964 | 538,498 | +1.94% | | SET | 20 | 606,148 | 594,872 | +1.89% | | SET | 30 | 631,122 | 616,606 | +2.35% | | GET | 10 | 628,482 | 624,166 | +0.69% | | GET | 20 | 687,371 | 681,659 | +0.84% | | GET | 30 | 725,855 | 721,102 | +0.66% | ### Observations: 1. Single-threaded operations show consistent improvements (1-1.4%) 2. Multi-threaded performance shows significant gains for large datasets: - SET with 3M keys: +8.83% improvement - GET with 3M keys: +4.64% improvement 3. Pipeline operations show consistent improvements: - SET operations: +1.89% to +2.35% - GET operations: +0.66% to +0.84% 4. No performance regressions observed in any test scenario Related issue:https://github.com/valkey-io/valkey/issues/761 --------- Signed-off-by: Uri Yagelnik Signed-off-by: uriyage <78144248+uriyage@users.noreply.github.com> Co-authored-by: Viktor Söderqvist --- src/acl.c | 6 +- src/aof.c | 3 +- src/blocked.c | 127 ++++++++------ src/cluster.c | 14 +- src/cluster_legacy.c | 2 +- src/module.c | 100 ++++++----- src/module.h | 1 + src/multi.c | 113 ++++++------ src/networking.c | 238 ++++++++----------------- src/pubsub.c | 66 +++++-- src/rdb.c | 8 +- src/replication.c | 405 +++++++++++++++++++++++++------------------ src/script.c | 1 + src/server.c | 65 +++---- src/server.h | 258 ++++++++++++++------------- src/timeout.c | 8 +- src/tracking.c | 23 +-- 17 files changed, 761 insertions(+), 677 deletions(-) diff --git a/src/acl.c b/src/acl.c index 0928c43914..184fa54116 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1960,7 +1960,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { if (getClientType(c) == CLIENT_TYPE_PUBSUB) { /* Check for pattern violations. */ - dictIterator *di = dictGetIterator(c->pubsub_patterns); + dictIterator *di = dictGetIterator(c->pubsub_data->pubsub_patterns); dictEntry *de; while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); @@ -1972,7 +1972,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { /* Check for channel violations. */ if (!kill) { /* Check for global channels violation. */ - di = dictGetIterator(c->pubsub_channels); + di = dictGetIterator(c->pubsub_data->pubsub_channels); while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); @@ -1983,7 +1983,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { } if (!kill) { /* Check for shard channels violation. 
*/ - di = dictGetIterator(c->pubsubshard_channels); + di = dictGetIterator(c->pubsub_data->pubsubshard_channels); while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); diff --git a/src/aof.c b/src/aof.c index 8ac44f64c2..3629fa1acf 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1382,7 +1382,8 @@ struct client *createAOFClient(void) { /* We set the fake client as a replica waiting for the synchronization * so that the server will not try to send replies to this client. */ - c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + initClientReplicationData(c); + c->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; return c; } diff --git a/src/blocked.c b/src/blocked.c index 39050932d9..d2d6a5d314 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -75,16 +75,25 @@ static void moduleUnblockClientOnKey(client *c, robj *key); static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key); void initClientBlockingState(client *c) { - c->bstate.btype = BLOCKED_NONE; - c->bstate.timeout = 0; - c->bstate.unblock_on_nokey = 0; - c->bstate.keys = dictCreate(&objectKeyHeapPointerValueDictType); - c->bstate.numreplicas = 0; - c->bstate.numlocal = 0; - c->bstate.reploffset = 0; - c->bstate.generic_blocked_list_node = NULL; - c->bstate.module_blocked_handle = NULL; - c->bstate.async_rm_call_handle = NULL; + if (c->bstate) return; + c->bstate = zmalloc(sizeof(blockingState)); + c->bstate->btype = BLOCKED_NONE; + c->bstate->timeout = 0; + c->bstate->unblock_on_nokey = 0; + c->bstate->keys = dictCreate(&objectKeyHeapPointerValueDictType); + c->bstate->numreplicas = 0; + c->bstate->numlocal = 0; + c->bstate->reploffset = 0; + c->bstate->generic_blocked_list_node = NULL; + c->bstate->module_blocked_handle = NULL; + c->bstate->async_rm_call_handle = NULL; +} + +void freeClientBlockingState(client *c) { + if (!c->bstate) return; + dictRelease(c->bstate->keys); + zfree(c->bstate); + c->bstate = NULL; } /* Block a client for the specific operation type. Once the CLIENT_BLOCKED @@ -94,8 +103,10 @@ void blockClient(client *c, int btype) { /* Primary client should never be blocked unless pause or module */ serverAssert(!(c->flag.primary && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + initClientBlockingState(c); + c->flag.blocked = 1; - c->bstate.btype = btype; + c->bstate->btype = btype; if (!c->flag.module) server.blocked_clients++; /* We count blocked client stats on regular clients and not on module clients */ server.blocked_clients_by_type[btype]++; @@ -199,18 +210,18 @@ void queueClientForReprocessing(client *c) { /* Unblock a client calling the right function depending on the kind * of operation the client is blocking for. 
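The lazy-allocation idiom that initClientBlockingState() and freeClientBlockingState() introduce above can be reduced to a standalone sketch (hypothetical trimmed-down structs, not the server's definitions):

#include <stdlib.h>

typedef struct blockingState {
    int btype;
    long long timeout;
} blockingState;

typedef struct client {
    blockingState *bstate; /* NULL until the client first blocks */
} client;

static void initClientBlockingState(client *c) {
    if (c->bstate) return; /* already allocated: idempotent no-op */
    c->bstate = calloc(1, sizeof(*c->bstate));
}

static void freeClientBlockingState(client *c) {
    if (!c->bstate) return;
    free(c->bstate);
    c->bstate = NULL;
}

Every code path that can block calls the init function first, so clients that never block never pay for the allocation; the patch applies the same pattern to the pubsub, replication, module and multi components.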
*/ void unblockClient(client *c, int queue_for_reprocessing) { - if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { + if (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || c->bstate->btype == BLOCKED_STREAM) { unblockClientWaitingData(c); - } else if (c->bstate.btype == BLOCKED_WAIT) { + } else if (c->bstate->btype == BLOCKED_WAIT) { unblockClientWaitingReplicas(c); - } else if (c->bstate.btype == BLOCKED_MODULE) { + } else if (c->bstate->btype == BLOCKED_MODULE) { if (moduleClientIsBlockedOnKeys(c)) unblockClientWaitingData(c); unblockClientFromModule(c); - } else if (c->bstate.btype == BLOCKED_POSTPONE) { - serverAssert(c->bstate.postponed_list_node); - listDelNode(server.postponed_clients, c->bstate.postponed_list_node); - c->bstate.postponed_list_node = NULL; - } else if (c->bstate.btype == BLOCKED_SHUTDOWN) { + } else if (c->bstate->btype == BLOCKED_POSTPONE) { + serverAssert(c->bstate->postponed_list_node); + listDelNode(server.postponed_clients, c->bstate->postponed_list_node); + c->bstate->postponed_list_node = NULL; + } else if (c->bstate->btype == BLOCKED_SHUTDOWN) { /* No special cleanup. */ } else { serverPanic("Unknown btype in unblockClient()."); @@ -218,7 +229,7 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Reset the client for a new query, unless the client has pending command to process * or in case a shutdown operation was canceled and we are still in the processCommand sequence */ - if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) { + if (!c->flag.pending_command && c->bstate->btype != BLOCKED_SHUTDOWN) { /* Clients that are not blocked on keys are not reprocessed so we must * call reqresAppendResponse here (for clients blocked on key, * unblockClientOnKey is called, which eventually calls processCommand, @@ -229,12 +240,12 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* We count blocked client stats on regular clients and not on module clients */ if (!c->flag.module) server.blocked_clients--; - server.blocked_clients_by_type[c->bstate.btype]--; + server.blocked_clients_by_type[c->bstate->btype]--; /* Clear the flags, and put the client in the unblocked list so that * we'll process new commands in its query buffer ASAP. */ c->flag.blocked = 0; - c->bstate.btype = BLOCKED_NONE; - c->bstate.unblock_on_nokey = 0; + c->bstate->btype = BLOCKED_NONE; + c->bstate->unblock_on_nokey = 0; removeClientFromTimeoutTable(c); if (queue_for_reprocessing) queueClientForReprocessing(c); } @@ -243,22 +254,22 @@ void unblockClient(client *c, int queue_for_reprocessing) { * send it a reply of some kind. After this function is called, * unblockClient() will be called with the same client as argument. 
*/ void replyToBlockedClientTimedOut(client *c) { - if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { + if (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || c->bstate->btype == BLOCKED_STREAM) { addReplyNullArray(c); updateStatsOnUnblock(c, 0, 0, 0); - } else if (c->bstate.btype == BLOCKED_WAIT) { + } else if (c->bstate->btype == BLOCKED_WAIT) { if (c->cmd->proc == waitCommand) { - addReplyLongLong(c, replicationCountAcksByOffset(c->bstate.reploffset)); + addReplyLongLong(c, replicationCountAcksByOffset(c->bstate->reploffset)); } else if (c->cmd->proc == waitaofCommand) { addReplyArrayLen(c, 2); - addReplyLongLong(c, server.fsynced_reploff >= c->bstate.reploffset); - addReplyLongLong(c, replicationCountAOFAcksByOffset(c->bstate.reploffset)); + addReplyLongLong(c, server.fsynced_reploff >= c->bstate->reploffset); + addReplyLongLong(c, replicationCountAOFAcksByOffset(c->bstate->reploffset)); } else if (c->cmd->proc == clusterCommand) { addReplyErrorObject(c, shared.noreplicaserr); } else { serverPanic("Unknown wait command %s in replyToBlockedClientTimedOut().", c->cmd->declared_name); } - } else if (c->bstate.btype == BLOCKED_MODULE) { + } else if (c->bstate->btype == BLOCKED_MODULE) { moduleBlockedClientTimedOut(c, 0); } else { serverPanic("Unknown btype in replyToBlockedClientTimedOut()."); @@ -274,7 +285,7 @@ void replyToClientsBlockedOnShutdown(void) { listRewind(server.clients, &li); while ((ln = listNext(&li))) { client *c = listNodeValue(ln); - if (c->flag.blocked && c->bstate.btype == BLOCKED_SHUTDOWN) { + if (c->flag.blocked && c->bstate->btype == BLOCKED_SHUTDOWN) { addReplyError(c, "Errors trying to SHUTDOWN. Check logs."); unblockClient(c, 1); } @@ -301,7 +312,7 @@ void disconnectAllBlockedClients(void) { * command processing will start from scratch, and the command will * be either executed or rejected. (unlike LIST blocked clients for * which the command is already in progress in a way. */ - if (c->bstate.btype == BLOCKED_POSTPONE) continue; + if (c->bstate->btype == BLOCKED_POSTPONE) continue; unblockClientOnError(c, "-UNBLOCKED force unblock from blocking operation, " "instance state changed (master -> replica?)"); @@ -386,15 +397,17 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo list *l; int j; + initClientBlockingState(c); + if (!c->flag.reprocessing_command) { /* If the client is re-processing the command, we do not set the timeout * because we need to retain the client's original timeout. */ - c->bstate.timeout = timeout; + c->bstate->timeout = timeout; } for (j = 0; j < numkeys; j++) { /* If the key already exists in the dictionary ignore it. 
*/ - if (!(client_blocked_entry = dictAddRaw(c->bstate.keys, keys[j], NULL))) { + if (!(client_blocked_entry = dictAddRaw(c->bstate->keys, keys[j], NULL))) { continue; } incrRefCount(keys[j]); @@ -411,7 +424,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo l = dictGetVal(db_blocked_existing_entry); } listAddNodeTail(l, c); - dictSetVal(c->bstate.keys, client_blocked_entry, listLast(l)); + dictSetVal(c->bstate->keys, client_blocked_entry, listLast(l)); /* We need to add the key to blocking_keys_unblock_on_nokey, if the client * wants to be awakened if key is deleted (like XREADGROUP) */ @@ -425,7 +438,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo } } } - c->bstate.unblock_on_nokey = unblock_on_nokey; + c->bstate->unblock_on_nokey = unblock_on_nokey; /* Currently we assume key blocking will require reprocessing the command. * However in case of modules, they have a different way to handle the reprocessing * which does not require setting the pending command flag */ @@ -439,15 +452,15 @@ static void unblockClientWaitingData(client *c) { dictEntry *de; dictIterator *di; - if (dictSize(c->bstate.keys) == 0) return; + if (dictSize(c->bstate->keys) == 0) return; - di = dictGetIterator(c->bstate.keys); + di = dictGetIterator(c->bstate->keys); /* The client may wait for multiple keys, so unblock it for every key. */ while ((de = dictNext(di)) != NULL) { releaseBlockedEntry(c, de, 0); } dictReleaseIterator(di); - dictEmpty(c->bstate.keys, NULL); + dictEmpty(c->bstate->keys, NULL); } static blocking_type getBlockedTypeByType(int type) { @@ -546,7 +559,7 @@ static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) { if (listLength(l) == 0) { dictDelete(c->db->blocking_keys, key); dictDelete(c->db->blocking_keys_unblock_on_nokey, key); - } else if (c->bstate.unblock_on_nokey) { + } else if (c->bstate->unblock_on_nokey) { unblock_on_nokey_entry = dictFind(c->db->blocking_keys_unblock_on_nokey, key); /* it is not possible to have a client blocked on nokey with no matching entry */ serverAssertWithInfo(c, key, unblock_on_nokey_entry != NULL); @@ -555,7 +568,7 @@ static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) { dictDelete(c->db->blocking_keys_unblock_on_nokey, key); } } - if (remove_key) dictDelete(c->bstate.keys, key); + if (remove_key) dictDelete(c->bstate->keys, key); } void signalKeyAsReady(serverDb *db, robj *key, int type) { @@ -593,9 +606,9 @@ static void handleClientsBlockedOnKey(readyList *rl) { * module is trying to accomplish right now. * 3. In case of XREADGROUP call we will want to unblock on any change in object type * or in case the key was deleted, since the group is no longer valid. 
*/ - if ((o != NULL && (receiver->bstate.btype == getBlockedTypeByType(o->type))) || - (o != NULL && (receiver->bstate.btype == BLOCKED_MODULE)) || (receiver->bstate.unblock_on_nokey)) { - if (receiver->bstate.btype != BLOCKED_MODULE) + if ((o != NULL && (receiver->bstate->btype == getBlockedTypeByType(o->type))) || + (o != NULL && (receiver->bstate->btype == BLOCKED_MODULE)) || (receiver->bstate->unblock_on_nokey)) { + if (receiver->bstate->btype != BLOCKED_MODULE) unblockClientOnKey(receiver, rl->key); else moduleUnblockClientOnKey(receiver, rl->key); @@ -606,16 +619,17 @@ static void handleClientsBlockedOnKey(readyList *rl) { /* block a client for replica acknowledgement */ void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, long numreplicas, int numlocal) { - c->bstate.timeout = timeout; - c->bstate.reploffset = offset; - c->bstate.numreplicas = numreplicas; - c->bstate.numlocal = numlocal; + initClientBlockingState(c); + c->bstate->timeout = timeout; + c->bstate->reploffset = offset; + c->bstate->numreplicas = numreplicas; + c->bstate->numlocal = numlocal; listAddNodeHead(server.clients_waiting_acks, c); /* Note that we remember the linked list node where the client is stored, * this way removing the client in unblockClientWaitingReplicas() will not * require a linear scan, but just a constant time operation. */ - serverAssert(c->bstate.client_waiting_acks_list_node == NULL); - c->bstate.client_waiting_acks_list_node = listFirst(server.clients_waiting_acks); + serverAssert(c->bstate->client_waiting_acks_list_node == NULL); + c->bstate->client_waiting_acks_list_node = listFirst(server.clients_waiting_acks); blockClient(c, BLOCKED_WAIT); } @@ -623,11 +637,12 @@ void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, lon * requesting to avoid processing clients commands which will be processed later * when the it is ready to accept them. */ void blockPostponeClient(client *c) { - c->bstate.timeout = 0; + initClientBlockingState(c); + c->bstate->timeout = 0; blockClient(c, BLOCKED_POSTPONE); listAddNodeTail(server.postponed_clients, c); - serverAssert(c->bstate.postponed_list_node == NULL); - c->bstate.postponed_list_node = listLast(server.postponed_clients); + serverAssert(c->bstate->postponed_list_node == NULL); + c->bstate->postponed_list_node = listLast(server.postponed_clients); /* Mark this client to execute its command */ c->flag.pending_command = 1; } @@ -644,13 +659,13 @@ void blockClientShutdown(client *c) { static void unblockClientOnKey(client *c, robj *key) { dictEntry *de; - de = dictFind(c->bstate.keys, key); + de = dictFind(c->bstate->keys, key); releaseBlockedEntry(c, de, 1); /* Only in case of blocking API calls, we might be blocked on several keys. however we should force unblock the entire blocking keys */ - serverAssert(c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET); + serverAssert(c->bstate->btype == BLOCKED_STREAM || c->bstate->btype == BLOCKED_LIST || + c->bstate->btype == BLOCKED_ZSET); /* We need to unblock the client before calling processCommandAndResetClient * because it checks the CLIENT_BLOCKED flag */ @@ -712,7 +727,7 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { * command with timeout reply. */ void unblockClientOnTimeout(client *c) { /* The client has been unlocked (in the moduleUnblocked list), return ASAP. 
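The slot check performed just below goes through keyHashSlot(); a simplified sketch of that computation follows (CRC16 replaced by a stand-in hash, while the {hashtag} handling matches the documented cluster algorithm):

#include <stddef.h>

/* Stand-in for the real crc16(); any 16-bit hash fits the sketch. */
static unsigned int crc16_stub(const char *buf, size_t len) {
    unsigned int h = 0;
    while (len--) h = h * 31 + (unsigned char)*buf++;
    return h & 0xffff;
}

/* If the key contains a non-empty {tag}, only the tag is hashed, which
 * lets related keys be forced into the same slot. */
static int key_hash_slot(const char *key, size_t keylen) {
    size_t s, e;
    for (s = 0; s < keylen; s++)
        if (key[s] == '{') break;
    if (s == keylen) return crc16_stub(key, keylen) & 16383;
    for (e = s + 1; e < keylen; e++)
        if (key[e] == '}') break;
    if (e == keylen || e == s + 1) return crc16_stub(key, keylen) & 16383;
    return crc16_stub(key + s + 1, e - s - 1) & 16383;
}

Because a blocked client's keys were already validated to hash to one slot when the command was queued, inspecting the first key is sufficient here.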
*/ - if (c->bstate.btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; + if (c->bstate->btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; replyToBlockedClientTimedOut(c); if (c->flag.pending_command) c->flag.pending_command = 0; diff --git a/src/cluster.c b/src/cluster.c index 39d9161b9c..309279e0be 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1006,7 +1006,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* If CLIENT_MULTI flag is not set EXEC is just going to return an * error. */ if (!c->flag.multi) return myself; - ms = &c->mstate; + ms = c->mstate; } else { /* In order to have a single codepath create a fake Multi State * structure if the client is not in MULTI/EXEC state, this way @@ -1023,7 +1023,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. */ int pubsubshard_included = - (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_PUBSUB)); + (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); /* Check that all the keys are in the same hash slot, and obtain this * slot and the node associated. */ @@ -1176,7 +1176,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * node is a replica and the request is about a hash slot our primary * is serving, we can reply without redirection. */ int is_write_command = - (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); if ((c->flag.readonly || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && clusterNodeGetPrimary(myself) == n) { return myself; @@ -1233,14 +1233,14 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co * returns 1. Otherwise 0 is returned and no operation is performed. */ int clusterRedirectBlockedClientIfNeeded(client *c) { clusterNode *myself = getMyClusterNode(); - if (c->flag.blocked && (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_MODULE)) { + if (c->flag.blocked && (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || + c->bstate->btype == BLOCKED_STREAM || c->bstate->btype == BLOCKED_MODULE)) { dictEntry *de; dictIterator *di; /* If the client is blocked on module, but not on a specific key, * don't unblock it. */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) return 0; + if (c->bstate->btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) return 0; /* If the cluster is down, unblock the client with the right error. * If the cluster is configured to allow reads on cluster down, we @@ -1252,7 +1252,7 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { } /* All keys must belong to the same slot, so check first key only. 
*/ - di = dictGetIterator(c->bstate.keys); + di = dictGetIterator(c->bstate->keys); if ((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); int slot = keyHashSlot((char *)key->ptr, sdslen(key->ptr)); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index b59c30126a..0777d6d8c6 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6574,7 +6574,7 @@ void clusterCommandSetSlot(client *c) { * replication, it would also unlikely win the election. * * And 0x702ff is 7.2.255, we only support new versions in this case. */ - if (r->repl_state == REPLICA_STATE_ONLINE && r->replica_version > 0x702ff) { + if (r->repl_data->repl_state == REPLICA_STATE_ONLINE && r->repl_data->replica_version > 0x702ff) { num_eligible_replicas++; } } diff --git a/src/module.c b/src/module.c index dabea59d49..7388dc6a20 100644 --- a/src/module.c +++ b/src/module.c @@ -651,6 +651,19 @@ void *VM_PoolAlloc(ValkeyModuleCtx *ctx, size_t bytes) { * Helpers for modules API implementation * -------------------------------------------------------------------------- */ +static void initClientModuleData(client *c) { + if (c->module_data) return; + c->module_data = zcalloc(sizeof(ClientModuleData)); +} + +void freeClientModuleData(client *c) { + if (!c->module_data) return; + /* Free the ValkeyModuleBlockedClient held onto for reprocessing if not already freed. */ + zfree(c->module_data->module_blocked_client); + zfree(c->module_data); + c->module_data = NULL; +} + void moduleEnqueueLoadModule(sds path, sds *argv, int argc) { int i; struct moduleLoadQueueEntry *loadmod; @@ -721,11 +734,11 @@ void moduleReleaseTempClient(client *c) { c->flag.fake = 1; c->user = NULL; /* Root user */ c->cmd = c->lastcmd = c->realcmd = c->io_parsed_cmd = NULL; - if (c->bstate.async_rm_call_handle) { - ValkeyModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; + if (c->bstate && c->bstate->async_rm_call_handle) { + ValkeyModuleAsyncRMCallPromise *promise = c->bstate->async_rm_call_handle; promise->c = NULL; /* Remove the client from the promise so it will no longer be possible to abort it. */ freeValkeyModuleAsyncRMCallPromise(promise); - c->bstate.async_rm_call_handle = NULL; + c->bstate->async_rm_call_handle = NULL; } moduleTempClients[moduleTempClientCount++] = c; } @@ -897,7 +910,7 @@ static CallReply *moduleParseReply(client *c, ValkeyModuleCtx *ctx) { void moduleCallCommandUnblockedHandler(client *c) { ValkeyModuleCtx ctx; - ValkeyModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; + ValkeyModuleAsyncRMCallPromise *promise = c->bstate->async_rm_call_handle; serverAssert(promise); ValkeyModule *module = promise->module; if (!promise->on_unblocked) { @@ -6569,7 +6582,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const .ctx = (ctx->flags & VALKEYMODULE_CTX_AUTO_MEMORY) ? ctx : NULL, }; reply = callReplyCreatePromise(promise); - c->bstate.async_rm_call_handle = promise; + c->bstate->async_rm_call_handle = promise; if (!(call_flags & CMD_CALL_PROPAGATE_AOF)) { /* No need for AOF propagation, set the relevant flags of the client */ c->flag.module_prevent_aof_prop = 1; @@ -7679,7 +7692,7 @@ void VM_LatencyAddSample(const char *event, mstime_t latency) { /* Returns 1 if the client already in the moduleUnblocked list, 0 otherwise. 
*/ int isModuleClientUnblocked(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return bc->unblocked == 1; } @@ -7697,7 +7710,7 @@ int isModuleClientUnblocked(client *c) { * The structure ValkeyModuleBlockedClient will be always deallocated when * running the list of clients blocked by a module that need to be unblocked. */ void unblockClientFromModule(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Call the disconnection callback if any. Note that * bc->disconnect_callback is set to NULL if the client gets disconnected @@ -7765,9 +7778,10 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, client *c = ctx->client; int islua = scriptIsRunning(); int ismulti = server.in_exec; + initClientBlockingState(c); - c->bstate.module_blocked_handle = zmalloc(sizeof(ValkeyModuleBlockedClient)); - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + c->bstate->module_blocked_handle = zmalloc(sizeof(ValkeyModuleBlockedClient)); + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; ctx->module->blocked_clients++; /* We need to handle the invalid operation of calling modules blocking @@ -7795,7 +7809,7 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, if (timeout_ms) { mstime_t now = mstime(); if (timeout_ms > LLONG_MAX - now) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "timeout is out of range"); /* 'timeout_ms+now' would overflow */ return bc; } @@ -7803,20 +7817,20 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, } if (islua || ismulti) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, islua ? "Blocking module command called from Lua script" : "Blocking module command called from transaction"); } else if (ctx->flags & VALKEYMODULE_CTX_BLOCKED_REPLY) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "Blocking module command called from a Reply callback context"); } else if (!auth_reply_callback && clientHasModuleAuthInProgress(c)) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "Clients undergoing module based authentication can only be blocked on auth"); } else { if (keys) { blockForKeys(c, BLOCKED_MODULE, keys, numkeys, timeout, flags & VALKEYMODULE_BLOCK_UNBLOCK_DELETED); } else { - c->bstate.timeout = timeout; + c->bstate->timeout = timeout; blockClient(c, BLOCKED_MODULE); } } @@ -7912,7 +7926,7 @@ void moduleUnregisterAuthCBs(ValkeyModule *module) { /* Search for & attempt next module auth callback after skipping the ones already attempted. * Returns the result of the module auth callback. */ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { - int handle_next_callback = c->module_auth_ctx == NULL; + int handle_next_callback = (!c->module_data || c->module_data->module_auth_ctx == NULL); ValkeyModuleAuthCtx *cur_auth_ctx = NULL; listNode *ln; listIter li; @@ -7922,7 +7936,7 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { cur_auth_ctx = listNodeValue(ln); /* Skip over the previously attempted auth contexts. 
*/ if (!handle_next_callback) { - handle_next_callback = cur_auth_ctx == c->module_auth_ctx; + handle_next_callback = cur_auth_ctx == c->module_data->module_auth_ctx; continue; } /* Remove the module auth complete flag before we attempt the next cb. */ @@ -7931,7 +7945,8 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { moduleCreateContext(&ctx, cur_auth_ctx->module, VALKEYMODULE_CTX_NONE); ctx.client = c; *err = NULL; - c->module_auth_ctx = cur_auth_ctx; + initClientModuleData(c); + c->module_data->module_auth_ctx = cur_auth_ctx; result = cur_auth_ctx->auth_cb(&ctx, username, password, err); moduleFreeContext(&ctx); if (result == VALKEYMODULE_AUTH_HANDLED) break; @@ -7947,8 +7962,8 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { * return the result of the reply callback. */ int attemptBlockedAuthReplyCallback(client *c, robj *username, robj *password, robj **err) { int result = VALKEYMODULE_AUTH_NOT_HANDLED; - if (!c->module_blocked_client) return result; - ValkeyModuleBlockedClient *bc = (ValkeyModuleBlockedClient *)c->module_blocked_client; + if (!c->module_data || !c->module_data->module_blocked_client) return result; + ValkeyModuleBlockedClient *bc = (ValkeyModuleBlockedClient *)c->module_data->module_blocked_client; bc->client = c; if (bc->auth_reply_cb) { ValkeyModuleCtx ctx; @@ -7961,7 +7976,7 @@ int attemptBlockedAuthReplyCallback(client *c, robj *username, robj *password, r moduleFreeContext(&ctx); } moduleInvokeFreePrivDataCallback(c, bc); - c->module_blocked_client = NULL; + c->module_data->module_blocked_client = NULL; c->lastcmd->microseconds += bc->background_duration; bc->module->blocked_clients--; zfree(bc); @@ -7989,7 +8004,7 @@ int checkModuleAuthentication(client *c, robj *username, robj *password, robj ** serverAssert(result == VALKEYMODULE_AUTH_HANDLED); return AUTH_BLOCKED; } - c->module_auth_ctx = NULL; + if (c->module_data) c->module_data->module_auth_ctx = NULL; if (result == VALKEYMODULE_AUTH_NOT_HANDLED) { c->flag.module_auth_has_result = 0; return AUTH_NOT_HANDLED; @@ -8011,7 +8026,7 @@ int checkModuleAuthentication(client *c, robj *username, robj *password, robj ** * This function returns 1 if client was served (and should be unblocked) */ int moduleTryServeClientBlockedOnKey(client *c, robj *key) { int served = 0; - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Protect against re-processing: don't serve clients that are already * in the unblocking list for any reason (including VM_UnblockClient() @@ -8223,14 +8238,14 @@ int moduleUnblockClientByHandle(ValkeyModuleBlockedClient *bc, void *privdata) { /* This API is used by the server core to unblock a client that was blocked * by a module. */ void moduleUnblockClient(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; moduleUnblockClientByHandle(bc, NULL); } /* Return true if the client 'c' was blocked by a module using * VM_BlockClientOnKeys(). */ int moduleClientIsBlockedOnKeys(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return bc->blocked_on_keys; } @@ -8340,7 +8355,7 @@ void moduleHandleBlockedClients(void) { /* Hold onto the blocked client if module auth is in progress. The reply callback is invoked * when the client is reprocessed. 
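 * As a sketch, the full hand-off looks like this (the explicit init shown
 * here is defensive; on this path module_data already exists because module
 * auth is in progress):
 *
 *   initClientModuleData(c);                      // lazy per-client alloc
 *   c->module_data->module_blocked_client = bc;   // park bc for reprocessing
 *   // ... client is reprocessed ...
 *   attemptBlockedAuthReplyCallback(c, user, pass, &err);
 *   // which consumes bc and resets module_blocked_client to NULL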
*/ if (c && clientHasModuleAuthInProgress(c)) { - c->module_blocked_client = bc; + c->module_data->module_blocked_client = bc; } else { /* Free privdata if any. */ moduleInvokeFreePrivDataCallback(c, bc); @@ -8402,9 +8417,9 @@ void moduleHandleBlockedClients(void) { * moduleBlockedClientTimedOut(). */ int moduleBlockedClientMayTimeout(client *c) { - if (c->bstate.btype != BLOCKED_MODULE) return 1; + if (c->bstate->btype != BLOCKED_MODULE) return 1; - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return (bc && bc->timeout_callback != NULL); } @@ -8420,7 +8435,7 @@ int moduleBlockedClientMayTimeout(client *c) { * of the client synchronously. This ensures that we can reply to the client before * resetClient() is called. */ void moduleBlockedClientTimedOut(client *c, int from_module) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Protect against re-processing: don't serve clients that are already * in the unblocking list for any reason (including VM_UnblockClient() @@ -9559,16 +9574,16 @@ static void eventLoopHandleOneShotEvents(void) { * A client's user can be changed through the AUTH command, module * authentication, and when a client is freed. */ void moduleNotifyUserChanged(client *c) { - if (c->auth_callback) { - c->auth_callback(c->id, c->auth_callback_privdata); + if (!c->module_data || !c->module_data->auth_callback) return; - /* The callback will fire exactly once, even if the user remains - * the same. It is expected to completely clean up the state - * so all references are cleared here. */ - c->auth_callback = NULL; - c->auth_callback_privdata = NULL; - c->auth_module = NULL; - } + c->module_data->auth_callback(c->id, c->module_data->auth_callback_privdata); + + /* The callback will fire exactly once, even if the user remains + * the same. It is expected to completely clean up the state + * so all references are cleared here. 
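+ * (For reference, the registration side, sketched from the
+ * authenticateClientWithUser() hunk later in this patch:
+ *
+ *   initClientModuleData(ctx->client);
+ *   ctx->client->module_data->auth_callback = callback;
+ *   ctx->client->module_data->auth_callback_privdata = privdata;
+ *   ctx->client->module_data->auth_module = ctx->module;
+ *
+ * so this function is the single point that tears that state down.)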
*/ + c->module_data->auth_callback = NULL; + c->module_data->auth_callback_privdata = NULL; + c->module_data->auth_module = NULL; } void revokeClientAuthentication(client *c) { @@ -9599,9 +9614,9 @@ static void moduleFreeAuthenticatedClients(ValkeyModule *module) { listRewind(server.clients, &li); while ((ln = listNext(&li)) != NULL) { client *c = listNodeValue(ln); - if (!c->auth_module) continue; + if (!c->module_data || !c->module_data->auth_module) continue; - ValkeyModule *auth_module = (ValkeyModule *)c->auth_module; + ValkeyModule *auth_module = (ValkeyModule *)c->module_data->auth_module; if (auth_module == module) { revokeClientAuthentication(c); } @@ -9909,9 +9924,10 @@ static int authenticateClientWithUser(ValkeyModuleCtx *ctx, } if (callback) { - ctx->client->auth_callback = callback; - ctx->client->auth_callback_privdata = privdata; - ctx->client->auth_module = ctx->module; + initClientModuleData(ctx->client); + ctx->client->module_data->auth_callback = callback; + ctx->client->module_data->auth_callback_privdata = privdata; + ctx->client->module_data->auth_module = ctx->module; } if (client_id) { diff --git a/src/module.h b/src/module.h index 78d9341ca9..f4e4de67eb 100644 --- a/src/module.h +++ b/src/module.h @@ -228,5 +228,6 @@ int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime end void moduleDefragGlobals(void); void *moduleGetHandleByName(char *modulename); int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); +void freeClientModuleData(client *c); #endif /* _MODULE_H_ */ diff --git a/src/multi.c b/src/multi.c index 9e1f019244..9e3aad9d3c 100644 --- a/src/multi.c +++ b/src/multi.c @@ -33,33 +33,42 @@ /* Client state initialization for MULTI/EXEC */ void initClientMultiState(client *c) { - c->mstate.commands = NULL; - c->mstate.count = 0; - c->mstate.cmd_flags = 0; - c->mstate.cmd_inv_flags = 0; - c->mstate.argv_len_sums = 0; - c->mstate.alloc_count = 0; + if (c->mstate) return; + c->mstate = zcalloc(sizeof(multiState)); } -/* Release all the resources associated with MULTI/EXEC state */ -void freeClientMultiState(client *c) { - int j; - - for (j = 0; j < c->mstate.count; j++) { +void freeClientMultiStateCmds(client *c) { + for (int j = 0; j < c->mstate->count; j++) { int i; - multiCmd *mc = c->mstate.commands + j; + multiCmd *mc = c->mstate->commands + j; for (i = 0; i < mc->argc; i++) decrRefCount(mc->argv[i]); zfree(mc->argv); } - zfree(c->mstate.commands); + + zfree(c->mstate->commands); + c->mstate->commands = NULL; +} + +/* Release all the resources associated with MULTI/EXEC state */ +void freeClientMultiState(client *c) { + if (!c->mstate) return; + + freeClientMultiStateCmds(c); + unwatchAllKeys(c); + zfree(c->mstate); + c->mstate = NULL; } void resetClientMultiState(client *c) { - if (c->mstate.commands) { - freeClientMultiState(c); - initClientMultiState(c); - } + if (!c->mstate || !c->mstate->commands) return; + + freeClientMultiStateCmds(c); + c->mstate->count = 0; + c->mstate->cmd_flags = 0; + c->mstate->cmd_inv_flags = 0; + c->mstate->argv_len_sums = 0; + c->mstate->alloc_count = 0; } /* Add a new command into the MULTI commands queue */ @@ -71,26 +80,27 @@ void queueMultiCommand(client *c, uint64_t cmd_flags) { * bother to read previous responses and didn't notice the multi was already * aborted. 
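 * With mstate now a lazily allocated pointer, the access pattern used below
 * and across this file is, as a sketch:
 *
 *   if (!c->mstate) initClientMultiState(c);   // zcalloc'd on first use
 *   c->mstate->count++;                        // then safe to dereference
 *
 * so clients that never queue a command pay no multiState allocation at all.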
*/ if (c->flag.dirty_cas || c->flag.dirty_exec) return; - if (c->mstate.count == 0) { + if (!c->mstate) initClientMultiState(c); + if (c->mstate->count == 0) { /* If a client is using multi/exec, assuming it is used to execute at least * two commands. Hence, creating by default size of 2. */ - c->mstate.commands = zmalloc(sizeof(multiCmd) * 2); - c->mstate.alloc_count = 2; + c->mstate->commands = zmalloc(sizeof(multiCmd) * 2); + c->mstate->alloc_count = 2; } - if (c->mstate.count == c->mstate.alloc_count) { - c->mstate.alloc_count = c->mstate.alloc_count < INT_MAX / 2 ? c->mstate.alloc_count * 2 : INT_MAX; - c->mstate.commands = zrealloc(c->mstate.commands, sizeof(multiCmd) * (c->mstate.alloc_count)); + if (c->mstate->count == c->mstate->alloc_count) { + c->mstate->alloc_count = c->mstate->alloc_count < INT_MAX / 2 ? c->mstate->alloc_count * 2 : INT_MAX; + c->mstate->commands = zrealloc(c->mstate->commands, sizeof(multiCmd) * (c->mstate->alloc_count)); } - mc = c->mstate.commands + c->mstate.count; + mc = c->mstate->commands + c->mstate->count; mc->cmd = c->cmd; mc->argc = c->argc; mc->argv = c->argv; mc->argv_len = c->argv_len; - c->mstate.count++; - c->mstate.cmd_flags |= cmd_flags; - c->mstate.cmd_inv_flags |= ~cmd_flags; - c->mstate.argv_len_sums += c->argv_len_sum + sizeof(robj *) * c->argc; + c->mstate->count++; + c->mstate->cmd_flags |= cmd_flags; + c->mstate->cmd_inv_flags |= ~cmd_flags; + c->mstate->argv_len_sums += c->argv_len_sum + sizeof(robj *) * c->argc; /* Reset the client's args since we copied them into the mstate and shouldn't * reference them from c anymore. */ @@ -118,6 +128,7 @@ void flagTransaction(client *c) { } void multiCommand(client *c) { + if (!c->mstate) initClientMultiState(c); c->flag.multi = 1; addReply(c, shared.ok); } @@ -195,12 +206,12 @@ void execCommand(client *c) { orig_argv_len = c->argv_len; orig_argc = c->argc; orig_cmd = c->cmd; - addReplyArrayLen(c, c->mstate.count); - for (j = 0; j < c->mstate.count; j++) { - c->argc = c->mstate.commands[j].argc; - c->argv = c->mstate.commands[j].argv; - c->argv_len = c->mstate.commands[j].argv_len; - c->cmd = c->realcmd = c->mstate.commands[j].cmd; + addReplyArrayLen(c, c->mstate->count); + for (j = 0; j < c->mstate->count; j++) { + c->argc = c->mstate->commands[j].argc; + c->argv = c->mstate->commands[j].argv; + c->argv_len = c->mstate->commands[j].argv_len; + c->cmd = c->realcmd = c->mstate->commands[j].cmd; /* ACL permissions are also checked at the time of execution in case * they were changed after the commands were queued. */ @@ -234,10 +245,10 @@ void execCommand(client *c) { } /* Commands may alter argc/argv, restore mstate. */ - c->mstate.commands[j].argc = c->argc; - c->mstate.commands[j].argv = c->argv; - c->mstate.commands[j].argv_len = c->argv_len; - c->mstate.commands[j].cmd = c->cmd; + c->mstate->commands[j].argc = c->argc; + c->mstate->commands[j].argv = c->argv; + c->mstate->commands[j].argv_len = c->argv_len; + c->mstate->commands[j].cmd = c->cmd; /* The original argv has already been processed for slowlog and monitor, * so we can safely free it before proceeding to the next command. 
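 * As a sketch, each loop iteration here installs the queued vector, runs
 * the command, and captures it back, since commands may rewrite their own
 * argv:
 *
 *   c->argv = c->mstate->commands[j].argv;    // install queued args
 *   c->cmd = c->realcmd = c->mstate->commands[j].cmd;
 *   call(c, CMD_CALL_FULL);
 *   c->mstate->commands[j].argv = c->argv;    // keep any rewrite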
*/ @@ -304,10 +315,10 @@ void watchForKey(client *c, robj *key) { listNode *ln; watchedKey *wk; - if (listLength(c->watched_keys) == 0) server.watching_clients++; + if (listLength(&c->mstate->watched_keys) == 0) server.watching_clients++; /* Check if we are already watching for this key */ - listRewind(c->watched_keys, &li); + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { wk = listNodeValue(ln); if (wk->db == c->db && equalStringObjects(key, wk->key)) return; /* Key already watched */ } @@ -326,7 +337,7 @@ void watchForKey(client *c, robj *key) { wk->db = c->db; wk->expired = keyIsExpired(c->db, key); incrRefCount(key); - listAddNodeTail(c->watched_keys, wk); + listAddNodeTail(&c->mstate->watched_keys, wk); watchedKeyLinkToClients(clients, wk); } @@ -336,8 +347,8 @@ void unwatchAllKeys(client *c) { listIter li; listNode *ln; - if (listLength(c->watched_keys) == 0) return; - listRewind(c->watched_keys, &li); + if (!c->mstate || listLength(&c->mstate->watched_keys) == 0) return; + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { list *clients; watchedKey *wk; @@ -350,7 +361,7 @@ void unwatchAllKeys(client *c) { /* Kill the entry at all if this was the only client */ if (listLength(clients) == 0) dictDelete(wk->db->watched_keys, wk->key); /* Remove this watched key from the client->watched list */ - listDelNode(c->watched_keys, ln); + listDelNode(&c->mstate->watched_keys, ln); decrRefCount(wk->key); zfree(wk); } @@ -363,8 +374,8 @@ int isWatchedKeyExpired(client *c) { listIter li; listNode *ln; watchedKey *wk; - if (listLength(c->watched_keys) == 0) return 0; - listRewind(c->watched_keys, &li); + if (!c->mstate || listLength(&c->mstate->watched_keys) == 0) return 0; + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { wk = listNodeValue(ln); if (wk->expired) continue; /* was expired when WATCH was called */ @@ -474,6 +485,9 @@ void watchCommand(client *c) { addReply(c, shared.ok); return; } + + if (!c->mstate) initClientMultiState(c); + for (j = 1; j < c->argc; j++) watchForKey(c, c->argv[j]); addReply(c, shared.ok); } @@ -485,11 +499,12 @@ void unwatchCommand(client *c) { } size_t multiStateMemOverhead(client *c) { - size_t mem = c->mstate.argv_len_sums; + if (!c->mstate) return 0; + size_t mem = c->mstate->argv_len_sums; /* Add watched keys overhead, Note: this doesn't take into account the watched keys themselves, because they aren't * managed per-client. */ - mem += listLength(c->watched_keys) * (sizeof(listNode) + sizeof(watchedKey)); + mem += listLength(&c->mstate->watched_keys) * (sizeof(listNode) + sizeof(watchedKey)); /* Reserved memory for queued multi commands.
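 * As a sketch, for a client with W watched keys and alloc_count queued
 * command slots the reported overhead is:
 *
 *   mem = argv_len_sums                                 // queued argv payload
 *       + W * (sizeof(listNode) + sizeof(watchedKey))   // watch bookkeeping
 *       + alloc_count * sizeof(multiCmd);               // command queue slots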
*/ - mem += c->mstate.alloc_count * sizeof(multiCmd); + mem += c->mstate->alloc_count * sizeof(multiCmd); return mem; } diff --git a/src/networking.c b/src/networking.c index 86f87deb8b..339cd304d4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -119,7 +119,7 @@ int authRequired(client *c) { } static inline int isReplicaReadyForReplData(client *replica) { - return (replica->repl_state == REPLICA_STATE_ONLINE || replica->repl_state == REPLICA_STATE_BG_RDB_LOAD) && + return (replica->repl_data->repl_state == REPLICA_STATE_ONLINE || replica->repl_data->repl_state == REPLICA_STATE_BG_RDB_LOAD) && !(replica->flag.close_asap); } @@ -154,8 +154,6 @@ client *createClient(connection *conn) { c->bufpos = 0; c->buf_peak = c->buf_usable_size; c->buf_peak_last_reset_time = server.unixtime; - c->ref_repl_buf_node = NULL; - c->ref_block_pos = 0; c->qb_pos = 0; c->querybuf = NULL; c->querybuf_peak = 0; @@ -180,55 +178,31 @@ client *createClient(connection *conn) { c->ctime = c->last_interaction = server.unixtime; c->duration = 0; clientSetDefaultAuth(c); - c->repl_state = REPL_STATE_NONE; - c->repl_start_cmd_stream_on_ack = 0; - c->reploff = 0; - c->read_reploff = 0; - c->repl_applied = 0; - c->repl_ack_off = 0; - c->repl_ack_time = 0; - c->repl_aof_off = 0; - c->repl_last_partial_write = 0; - c->replica_listening_port = 0; - c->replica_addr = NULL; - c->replica_version = 0; - c->replica_capa = REPLICA_CAPA_NONE; - c->replica_req = REPLICA_REQ_NONE; - c->associated_rdb_client_id = 0; - c->rdb_client_disconnect_time = 0; c->reply = listCreate(); c->deferred_reply_errors = NULL; c->reply_bytes = 0; c->obuf_soft_limit_reached_time = 0; listSetFreeMethod(c->reply, freeClientReplyValue); listSetDupMethod(c->reply, dupClientReplyValue); - initClientBlockingState(c); + c->repl_data = NULL; + c->bstate = NULL; + c->pubsub_data = NULL; + c->module_data = NULL; + c->mstate = NULL; c->woff = 0; - c->watched_keys = listCreate(); - c->pubsub_channels = dictCreate(&objectKeyPointerValueDictType); - c->pubsub_patterns = dictCreate(&objectKeyPointerValueDictType); - c->pubsubshard_channels = dictCreate(&objectKeyPointerValueDictType); c->peerid = NULL; c->sockname = NULL; c->client_list_node = NULL; c->io_read_state = CLIENT_IDLE; c->io_write_state = CLIENT_IDLE; c->nwritten = 0; - c->client_tracking_redirection = 0; - c->client_tracking_prefixes = NULL; c->last_memory_usage = 0; c->last_memory_type = CLIENT_TYPE_NORMAL; - c->module_blocked_client = NULL; - c->module_auth_ctx = NULL; - c->auth_callback = NULL; - c->auth_callback_privdata = NULL; - c->auth_module = NULL; listInitNode(&c->clients_pending_write_node, c); listInitNode(&c->pending_read_list_node, c); c->mem_usage_bucket = NULL; c->mem_usage_bucket_node = NULL; if (conn) linkClient(c); - initClientMultiState(c); c->net_input_bytes = 0; c->net_input_bytes_curr_cmd = 0; c->net_output_bytes = 0; @@ -266,7 +240,9 @@ void putClientInPendingWriteQueue(client *c) { * if not already done and, for replicas, if the replica can actually receive * writes at this stage. */ if (!c->flag.pending_write && - (c->repl_state == REPL_STATE_NONE || (isReplicaReadyForReplData(c) && !c->repl_start_cmd_stream_on_ack))) { + (!c->repl_data || + c->repl_data->repl_state == REPL_STATE_NONE || + (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack))) { /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. 
This way before re-entering the event @@ -1340,10 +1316,10 @@ void deferredAfterErrorReply(client *c, list *errors) { void copyReplicaOutputBuffer(client *dst, client *src) { serverAssert(src->bufpos == 0 && listLength(src->reply) == 0); - if (src->ref_repl_buf_node == NULL) return; - dst->ref_repl_buf_node = src->ref_repl_buf_node; - dst->ref_block_pos = src->ref_block_pos; - ((replBufBlock *)listNodeValue(dst->ref_repl_buf_node))->refcount++; + if (src->repl_data->ref_repl_buf_node == NULL) return; + dst->repl_data->ref_repl_buf_node = src->repl_data->ref_repl_buf_node; + dst->repl_data->ref_block_pos = src->repl_data->ref_block_pos; + ((replBufBlock *)listNodeValue(dst->repl_data->ref_repl_buf_node))->refcount++; } /* Return true if the specified client has pending reply buffers to write to @@ -1353,13 +1329,13 @@ int clientHasPendingReplies(client *c) { /* Replicas use global shared replication buffer instead of * private output buffer. */ serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); - if (c->ref_repl_buf_node == NULL) return 0; + if (c->repl_data->ref_repl_buf_node == NULL) return 0; /* If the last replication buffer block content is totally sent, * we have nothing to send. */ listNode *ln = listLast(server.repl_buffer_blocks); replBufBlock *tail = listNodeValue(ln); - if (ln == c->ref_repl_buf_node && c->ref_block_pos == tail->used) return 0; + if (ln == c->repl_data->ref_repl_buf_node && c->repl_data->ref_block_pos == tail->used) return 0; return 1; } else { @@ -1526,23 +1502,6 @@ void disconnectReplicas(void) { } } -/* Check if there is any other replica waiting dumping RDB finished expect me. - * This function is useful to judge current dumping RDB can be used for full - * synchronization or not. */ -int anyOtherReplicaWaitRdb(client *except_me) { - listIter li; - listNode *ln; - - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica != except_me && replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - return 1; - } - } - return 0; -} - /* Remove the specified client from global lists where the client could * be referenced, not including the Pub/Sub channels. * This is used by freeClient() and replicationCachePrimary(). */ @@ -1567,7 +1526,7 @@ void unlinkClient(client *c) { /* Check if this is a replica waiting for diskless replication (rdb pipe), * in which case it needs to be cleaned from that list */ - if (c->flag.replica && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { + if (c->repl_data && c->flag.replica && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { int i; int still_alive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { @@ -1653,11 +1612,7 @@ void clearClientConnectionState(client *c) { clientSetDefaultAuth(c); moduleNotifyUserChanged(c); discardTransaction(c); - - pubsubUnsubscribeAllChannels(c, 0); - pubsubUnsubscribeShardAllChannels(c, 0); - pubsubUnsubscribeAllPatterns(c, 0); - unmarkClientAsPubSub(c); + freeClientPubSubData(c); if (c->name) { decrRefCount(c->name); @@ -1696,9 +1651,7 @@ void freeClient(client *c) { /* Notify module system that this client auth status changed. */ moduleNotifyUserChanged(c); - - /* Free the RedisModuleBlockedClient held onto for reprocessing if not already freed. */ - zfree(c->module_blocked_client); + freeClientModuleData(c); /* If this client was scheduled for async freeing we need to remove it * from the queue. 
Note that we need to do this here, because later @@ -1745,31 +1698,16 @@ void freeClient(client *c) { /* If there is any in-flight command, we don't record their duration. */ c->duration = 0; if (c->flag.blocked) unblockClient(c, 1); - dictRelease(c->bstate.keys); - - /* UNWATCH all the keys */ - unwatchAllKeys(c); - listRelease(c->watched_keys); - c->watched_keys = NULL; - - /* Unsubscribe from all the pubsub channels */ - pubsubUnsubscribeAllChannels(c, 0); - pubsubUnsubscribeShardAllChannels(c, 0); - pubsubUnsubscribeAllPatterns(c, 0); - unmarkClientAsPubSub(c); - dictRelease(c->pubsub_channels); - c->pubsub_channels = NULL; - dictRelease(c->pubsub_patterns); - c->pubsub_patterns = NULL; - dictRelease(c->pubsubshard_channels); - c->pubsubshard_channels = NULL; + + freeClientBlockingState(c); + freeClientPubSubData(c); /* Free data structures. */ listRelease(c->reply); c->reply = NULL; zfree_with_size(c->buf, c->buf_usable_size); c->buf = NULL; - freeReplicaReferencedReplBuffer(c); + freeClientArgv(c); freeClientOriginalArgv(c); if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors); @@ -1787,45 +1725,7 @@ void freeClient(client *c) { * places where active clients may be referenced. */ unlinkClient(c); - /* Primary/replica cleanup Case 1: - * we lost the connection with a replica. */ - if (c->flag.replica) { - /* If there is no any other replica waiting dumping RDB finished, the - * current child process need not continue to dump RDB, then we kill it. - * So child process won't use more memory, and we also can fork a new - * child process asap to dump rdb for next full synchronization or bgsave. - * But we also need to check if users enable 'save' RDB, if enable, we - * should not remove directly since that means RDB is important for users - * to keep data safe and we may delay configured 'save' for full sync. */ - if (server.saveparamslen == 0 && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && - server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && - anyOtherReplicaWaitRdb(c) == 0) { - serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); - killRDBChild(); - } - if (c->repl_state == REPLICA_STATE_SEND_BULK) { - if (c->repldbfd != -1) close(c->repldbfd); - if (c->replpreamble) sdsfree(c->replpreamble); - } - list *l = (c->flag.monitor) ? server.monitors : server.replicas; - ln = listSearchKey(l, c); - serverAssert(ln != NULL); - listDelNode(l, ln); - /* We need to remember the time when we started to have zero - * attached replicas, as after some time we'll free the replication - * backlog. */ - if (getClientType(c) == CLIENT_TYPE_REPLICA && listLength(server.replicas) == 0) - server.repl_no_replicas_since = server.unixtime; - refreshGoodReplicasCount(); - /* Fire the replica change modules event. */ - if (c->repl_state == REPLICA_STATE_ONLINE) - moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, - NULL); - } - - /* Primary/replica cleanup Case 2: - * we lost the connection with the primary. 
*/ - if (c->flag.primary) replicationHandlePrimaryDisconnection(); + freeClientReplicationData(c); /* Remove client from memory usage buckets */ if (c->mem_usage_bucket) { @@ -1841,7 +1741,6 @@ void freeClient(client *c) { freeClientMultiState(c); sdsfree(c->peerid); sdsfree(c->sockname); - sdsfree(c->replica_addr); zfree(c); } @@ -1932,10 +1831,10 @@ void beforeNextClient(client *c) { * In these scenarios, qb_pos points to the part of the current command * or the beginning of next command, and the current command is not applied yet, * so the repl_applied is not equal to qb_pos. */ - if (c->repl_applied) { - sdsrange(c->querybuf, c->repl_applied, -1); - c->qb_pos -= c->repl_applied; - c->repl_applied = 0; + if (c->repl_data->repl_applied) { + sdsrange(c->querybuf, c->repl_data->repl_applied, -1); + c->qb_pos -= c->repl_data->repl_applied; + c->repl_data->repl_applied = 0; } } else { trimClientQueryBuffer(c); @@ -1974,18 +1873,18 @@ int freeClientsInAsyncFreeQueue(void) { * The primary gives a grace period before freeing this client because * it serves as a reference to the first required replication data block for * this replica */ - if (!c->rdb_client_disconnect_time) { + if (!c->repl_data->rdb_client_disconnect_time) { if (c->conn) connSetReadHandler(c->conn, NULL); - c->rdb_client_disconnect_time = server.unixtime; + c->repl_data->rdb_client_disconnect_time = server.unixtime; dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", (unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free); } - if (server.unixtime - c->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; + if (server.unixtime - c->repl_data->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; dualChannelServerLog( LL_NOTICE, "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " "Freeing RDB client %llu.", - (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); + (long int)(server.unixtime - c->repl_data->rdb_client_disconnect_time), (unsigned long long)c->id); c->flag.protected_rdb_channel = 0; } @@ -2015,27 +1914,27 @@ void writeToReplica(client *c) { int nwritten = 0; serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); while (clientHasPendingReplies(c)) { - replBufBlock *o = listNodeValue(c->ref_repl_buf_node); - serverAssert(o->used >= c->ref_block_pos); + replBufBlock *o = listNodeValue(c->repl_data->ref_repl_buf_node); + serverAssert(o->used >= c->repl_data->ref_block_pos); /* Send current block if it is not fully sent. */ - if (o->used > c->ref_block_pos) { - nwritten = connWrite(c->conn, o->buf + c->ref_block_pos, o->used - c->ref_block_pos); + if (o->used > c->repl_data->ref_block_pos) { + nwritten = connWrite(c->conn, o->buf + c->repl_data->ref_block_pos, o->used - c->repl_data->ref_block_pos); if (nwritten <= 0) { c->write_flags |= WRITE_FLAGS_WRITE_ERROR; return; } c->nwritten += nwritten; - c->ref_block_pos += nwritten; + c->repl_data->ref_block_pos += nwritten; } /* If we fully sent the object on head, go to the next one. 
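 * (Invariant kept by the code below, as a sketch: each replica pins exactly
 * one block, the one it still has to send, via that block's refcount:
 *
 *   o->refcount--;                      // release the drained block
 *   next_block->refcount++;             // pin the new head
 *   c->repl_data->ref_block_pos = 0;    // restart within it
 *
 * which lets incrementalTrimReplicationBacklog() free fully drained blocks.)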
*/ - listNode *next = listNextNode(c->ref_repl_buf_node); - if (next && c->ref_block_pos == o->used) { + listNode *next = listNextNode(c->repl_data->ref_repl_buf_node); + if (next && c->repl_data->ref_block_pos == o->used) { o->refcount--; ((replBufBlock *)(listNodeValue(next)))->refcount++; - c->ref_repl_buf_node = next; - c->ref_block_pos = 0; + c->repl_data->ref_repl_buf_node = next; + c->repl_data->ref_block_pos = 0; incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); } } @@ -2338,7 +2237,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; if (c->flag.primary) { - c->read_reploff += c->nread; + c->repl_data->read_reploff += c->nread; server.stat_net_repl_input_bytes += c->nread; } else { server.stat_net_input_bytes += c->nread; @@ -2409,7 +2308,7 @@ parseResult handleParseResults(client *c) { } if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN && getClientType(c) == CLIENT_TYPE_REPLICA) { - c->repl_ack_time = server.unixtime; + c->repl_data->repl_ack_time = server.unixtime; } if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN) { @@ -2993,10 +2892,12 @@ void commandProcessed(client *c) { clusterSlotStatsAddNetworkBytesInForUserClient(c); resetClient(c); - long long prev_offset = c->reploff; + if (!c->repl_data) return; + + long long prev_offset = c->repl_data->reploff; if (c->flag.primary && !c->flag.multi) { /* Update the applied replication offset of our primary. */ - c->reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } /* If the client is a primary we need to compute the difference @@ -3006,10 +2907,10 @@ void commandProcessed(client *c) { * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ if (c->flag.primary) { - long long applied = c->reploff - prev_offset; + long long applied = c->repl_data->reploff - prev_offset; if (applied) { - replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_applied, applied); - c->repl_applied += applied; + replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); + c->repl_data->repl_applied += applied; } } } @@ -3241,7 +3142,7 @@ void readToQueryBuf(client *c) { * so they are also considered a part of the query buffer in a broader sense. * * For unauthenticated clients, the query buffer cannot exceed 1MB at most. */ - size_t qb_memory = sdslen(c->querybuf) + c->mstate.argv_len_sums; + size_t qb_memory = sdslen(c->querybuf) + (c->mstate ? 
c->mstate->argv_len_sums : 0); if (qb_memory > server.client_max_querybuf_len || (qb_memory > 1024 * 1024 && (c->read_flags & READ_FLAGS_AUTH_REQUIRED))) { c->read_flags |= READ_FLAGS_QB_LIMIT_REACHED; @@ -3369,9 +3270,9 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { size_t obufmem, total_mem = getClientMemoryUsage(client, &obufmem); size_t used_blocks_of_repl_buf = 0; - if (client->ref_repl_buf_node) { + if (client->repl_data && client->repl_data->ref_repl_buf_node) { replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks)); - replBufBlock *cur = listNodeValue(client->ref_repl_buf_node); + replBufBlock *cur = listNodeValue(client->repl_data->ref_repl_buf_node); used_blocks_of_repl_buf = last->id - cur->id + 1; } sds ret = sdscatfmt( @@ -3386,15 +3287,15 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { " idle=%I", (long long)(server.unixtime - client->last_interaction), " flags=%s", flags, " db=%i", client->db->id, - " sub=%i", (int)dictSize(client->pubsub_channels), - " psub=%i", (int)dictSize(client->pubsub_patterns), - " ssub=%i", (int)dictSize(client->pubsubshard_channels), - " multi=%i", (client->flag.multi) ? client->mstate.count : -1, - " watch=%i", (int)listLength(client->watched_keys), + " sub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsub_channels) : 0, + " psub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsub_patterns) : 0, + " ssub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsubshard_channels) : 0, + " multi=%i", client->mstate ? client->mstate->count : -1, + " watch=%i", client->mstate ? (int)listLength(&client->mstate->watched_keys) : 0, " qbuf=%U", client->querybuf ? (unsigned long long)sdslen(client->querybuf) : 0, " qbuf-free=%U", client->querybuf ? (unsigned long long)sdsavail(client->querybuf) : 0, " argv-mem=%U", (unsigned long long)client->argv_len_sum, - " multi-mem=%U", (unsigned long long)client->mstate.argv_len_sums, + " multi-mem=%U", client->mstate ? (unsigned long long)client->mstate->argv_len_sums : 0, " rbs=%U", (unsigned long long)client->buf_usable_size, " rbp=%U", (unsigned long long)client->buf_peak, " obl=%U", (unsigned long long)client->bufpos, @@ -3404,7 +3305,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { " events=%s", events, " cmd=%s", client->lastcmd ? client->lastcmd->fullname : "NULL", " user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"), - " redir=%I", (client->flag.tracking) ? (long long)client->client_tracking_redirection : -1, + " redir=%I", (client->flag.tracking) ? (long long)client->pubsub_data->client_tracking_redirection : -1, " resp=%i", client->resp, " lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "", " lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : "", @@ -3892,6 +3793,7 @@ void clientCommand(client *c) { struct ClientFlags options = {0}; robj **prefix = NULL; size_t numprefix = 0; + initClientPubSubData(c); /* Parse the options. 
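 * For example, a hypothetical invocation exercising these options:
 *
 *   CLIENT TRACKING on REDIRECT 42 PREFIX user: PREFIX session: BCAST
 *
 * is what ends up populating client_tracking_redirection and
 * client_tracking_prefixes inside the lazily allocated pubsub_data.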
*/ for (int j = 3; j < c->argc; j++) { @@ -4031,7 +3933,7 @@ void clientCommand(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "getredir") && c->argc == 2) { /* CLIENT GETREDIR */ if (c->flag.tracking) { - addReplyLongLong(c, c->client_tracking_redirection); + addReplyLongLong(c, c->pubsub_data->client_tracking_redirection); } else { addReplyLongLong(c, -1); } @@ -4077,17 +3979,17 @@ void clientCommand(client *c) { /* Redirect */ addReplyBulkCString(c, "redirect"); if (c->flag.tracking) { - addReplyLongLong(c, c->client_tracking_redirection); + addReplyLongLong(c, c->pubsub_data->client_tracking_redirection); } else { addReplyLongLong(c, -1); } /* Prefixes */ addReplyBulkCString(c, "prefixes"); - if (c->client_tracking_prefixes) { - addReplyArrayLen(c, raxSize(c->client_tracking_prefixes)); + if (c->pubsub_data->client_tracking_prefixes) { + addReplyArrayLen(c, raxSize(c->pubsub_data->client_tracking_prefixes)); raxIterator ri; - raxStart(&ri, c->client_tracking_prefixes); + raxStart(&ri, c->pubsub_data->client_tracking_prefixes); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { addReplyBulkCBuffer(c, ri.key, ri.key_len); @@ -4410,9 +4312,9 @@ size_t getClientOutputBufferMemoryUsage(client *c) { size_t repl_buf_size = 0; size_t repl_node_num = 0; size_t repl_node_size = sizeof(listNode) + sizeof(replBufBlock); - if (c->ref_repl_buf_node) { + if (c->repl_data->ref_repl_buf_node) { replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks)); - replBufBlock *cur = listNodeValue(c->ref_repl_buf_node); + replBufBlock *cur = listNodeValue(c->repl_data->ref_repl_buf_node); repl_buf_size = last->repl_offset + last->size - cur->repl_offset; repl_node_num = last->id - cur->id + 1; } @@ -4445,8 +4347,8 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { /* Add memory overhead of the tracking prefixes, this is an underestimation so we don't need to traverse the entire * rax */ - if (c->client_tracking_prefixes) - mem += c->client_tracking_prefixes->numnodes * (sizeof(raxNode) * sizeof(raxNode *)); + if (c->pubsub_data && c->pubsub_data->client_tracking_prefixes) + mem += c->pubsub_data->client_tracking_prefixes->numnodes * (sizeof(raxNode) * sizeof(raxNode *)); return mem; } @@ -4612,7 +4514,7 @@ void flushReplicasOutputBuffers(void) { * 3. Obviously if the replica is not ONLINE. */ if (isReplicaReadyForReplData(replica) && !(replica->flag.close_asap) && can_receive_writes && - !replica->repl_start_cmd_stream_on_ack && clientHasPendingReplies(replica)) { + !replica->repl_data->repl_start_cmd_stream_on_ack && clientHasPendingReplies(replica)) { writeToClient(replica); } } diff --git a/src/pubsub.c b/src/pubsub.c index 3781fa39aa..27b5611788 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -219,20 +219,20 @@ int serverPubsubShardSubscriptionCount(void) { /* Return the number of channels + patterns a client is subscribed to. */ int clientSubscriptionsCount(client *c) { - return dictSize(c->pubsub_channels) + dictSize(c->pubsub_patterns); + return dictSize(c->pubsub_data->pubsub_channels) + dictSize(c->pubsub_data->pubsub_patterns); } /* Return the number of shard level channels a client is subscribed to. 
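 * (The helpers below assume pubsub_data was initialized by an earlier
 * subscribe; on paths where the client may never have subscribed, callers
 * are expected to guard, e.g. as a sketch:
 *
 *   int subs = c->pubsub_data ? clientShardSubscriptionsCount(c) : 0;
 * )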
*/ int clientShardSubscriptionsCount(client *c) { - return dictSize(c->pubsubshard_channels); + return dictSize(c->pubsub_data->pubsubshard_channels); } dict *getClientPubSubChannels(client *c) { - return c->pubsub_channels; + return c->pubsub_data->pubsub_channels; } dict *getClientPubSubShardChannels(client *c) { - return c->pubsubshard_channels; + return c->pubsub_data->pubsubshard_channels; } /* Return the number of pubsub + pubsub shard level channels @@ -255,6 +255,36 @@ void unmarkClientAsPubSub(client *c) { } } +void initClientPubSubData(client *c) { + if (c->pubsub_data) return; + c->pubsub_data = zmalloc(sizeof(ClientPubSubData)); + c->pubsub_data->pubsub_channels = dictCreate(&objectKeyPointerValueDictType); + c->pubsub_data->pubsub_patterns = dictCreate(&objectKeyPointerValueDictType); + c->pubsub_data->pubsubshard_channels = dictCreate(&objectKeyPointerValueDictType); + c->pubsub_data->client_tracking_redirection = 0; + c->pubsub_data->client_tracking_prefixes = NULL; +} + +void freeClientPubSubData(client *c) { + if (!c->pubsub_data) return; + /* Unsubscribe from all the pubsub channels */ + pubsubUnsubscribeAllChannels(c, 0); + pubsubUnsubscribeShardAllChannels(c, 0); + pubsubUnsubscribeAllPatterns(c, 0); + unmarkClientAsPubSub(c); + dictRelease(c->pubsub_data->pubsub_channels); + c->pubsub_data->pubsub_channels = NULL; + dictRelease(c->pubsub_data->pubsub_patterns); + c->pubsub_data->pubsub_patterns = NULL; + dictRelease(c->pubsub_data->pubsubshard_channels); + c->pubsub_data->pubsubshard_channels = NULL; + if (c->pubsub_data->client_tracking_prefixes) { + disableTracking(c); + } + zfree(c->pubsub_data); + c->pubsub_data = NULL; +} + /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. */ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { @@ -262,6 +292,8 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { int retval = 0; unsigned int slot = 0; + if (!c->pubsub_data) initClientPubSubData(c); + /* Add the channel to the client -> channels hash table */ void *position = dictFindPositionForInsert(type.clientPubSubChannels(c), channel, NULL); if (position) { /* Not yet subscribed to this channel */ @@ -344,7 +376,7 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { dictEntry *entry; while ((entry = dictNext(iter)) != NULL) { client *c = dictGetKey(entry); - int retval = dictDelete(c->pubsubshard_channels, channel); + int retval = dictDelete(c->pubsub_data->pubsubshard_channels, channel); serverAssertWithInfo(c, channel, retval == DICT_OK); addReplyPubsubUnsubscribed(c, channel, pubSubShardType); /* If the client has no other pubsub subscription, @@ -366,7 +398,9 @@ int pubsubSubscribePattern(client *c, robj *pattern) { dict *clients; int retval = 0; - if (dictAdd(c->pubsub_patterns, pattern, NULL) == DICT_OK) { + if (!c->pubsub_data) initClientPubSubData(c); + + if (dictAdd(c->pubsub_data->pubsub_patterns, pattern, NULL) == DICT_OK) { retval = 1; incrRefCount(pattern); /* Add the client to the pattern -> list of clients hash table */ @@ -392,8 +426,10 @@ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { dict *clients; int retval = 0; + if (!c->pubsub_data) initClientPubSubData(c); + incrRefCount(pattern); /* Protect the object. 
May be the same we remove */ - if (dictDelete(c->pubsub_patterns, pattern) == DICT_OK) { + if (dictDelete(c->pubsub_data->pubsub_patterns, pattern) == DICT_OK) { retval = 1; /* Remove the client from the pattern -> clients list hash table */ de = dictFind(server.pubsub_patterns, pattern); @@ -454,9 +490,10 @@ int pubsubUnsubscribeShardAllChannels(client *c, int notify) { * client was subscribed from. */ int pubsubUnsubscribeAllPatterns(client *c, int notify) { int count = 0; + if (!c->pubsub_data) initClientPubSubData(c); - if (dictSize(c->pubsub_patterns) > 0) { - dictIterator *di = dictGetSafeIterator(c->pubsub_patterns); + if (dictSize(c->pubsub_data->pubsub_patterns) > 0) { + dictIterator *di = dictGetSafeIterator(c->pubsub_data->pubsub_patterns); dictEntry *de; while ((de = dictNext(di)) != NULL) { @@ -560,6 +597,8 @@ void subscribeCommand(client *c) { /* UNSUBSCRIBE [channel ...] */ void unsubscribeCommand(client *c) { + if (!c->pubsub_data) initClientPubSubData(c); + if (c->argc == 1) { pubsubUnsubscribeAllChannels(c, 1); } else { @@ -732,6 +771,8 @@ void ssubscribeCommand(client *c) { /* SUNSUBSCRIBE [shardchannel [shardchannel ...]] */ void sunsubscribeCommand(client *c) { + if (!c->pubsub_data) initClientPubSubData(c); + if (c->argc == 1) { pubsubUnsubscribeShardAllChannels(c, 1); } else { @@ -745,12 +786,13 @@ void sunsubscribeCommand(client *c) { } size_t pubsubMemOverhead(client *c) { + if (!c->pubsub_data) return 0; /* PubSub patterns */ - size_t mem = dictMemUsage(c->pubsub_patterns); + size_t mem = dictMemUsage(c->pubsub_data->pubsub_patterns); /* Global PubSub channels */ - mem += dictMemUsage(c->pubsub_channels); + mem += dictMemUsage(c->pubsub_data->pubsub_channels); /* Sharded PubSub channels */ - mem += dictMemUsage(c->pubsubshard_channels); + mem += dictMemUsage(c->pubsub_data->pubsubshard_channels); return mem; } diff --git a/src/rdb.c b/src/rdb.c index 958eac5d4f..32c9021669 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3573,9 +3573,9 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { listRewind(server.replicas, &li); while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { /* Check replica has the exact requirements */ - if (replica->replica_req != req) continue; + if (replica->repl_data->replica_req != req) continue; conns[connsnum++] = replica->conn; if (dual_channel) { @@ -3646,8 +3646,8 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { listRewind(server.replicas, &li); while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - replica->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { + replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; } } if (!dual_channel) { diff --git a/src/replication.c b/src/replication.c index c5611d5a5a..9913d64d65 100644 --- a/src/replication.c +++ b/src/replication.c @@ -82,10 +82,10 @@ char *replicationGetReplicaName(client *c) { ip[0] = '\0'; buf[0] = '\0'; - if (c->replica_addr || connAddrPeerName(c->conn, ip, sizeof(ip), NULL) != -1) { - char *addr = c->replica_addr ? c->replica_addr : ip; - if (c->replica_listening_port) - formatAddr(buf, sizeof(buf), addr, c->replica_listening_port); + if (c->repl_data->replica_addr || connAddrPeerName(c->conn, ip, sizeof(ip), NULL) != -1) { + char *addr = c->repl_data->replica_addr ? 
c->repl_data->replica_addr : ip; + if (c->repl_data->replica_listening_port) + formatAddr(buf, sizeof(buf), addr, c->repl_data->replica_listening_port); else snprintf(buf, sizeof(buf), "%s:", addr); } else { @@ -231,7 +231,7 @@ void addRdbReplicaToPsyncWait(client *replica_rdb_client) { dualChannelServerLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); - replica_rdb_client->ref_repl_buf_node = tail ? ln : NULL; + replica_rdb_client->repl_data->ref_repl_buf_node = tail ? ln : NULL; /* Prevent rdb client from being freed before psync is established. */ replica_rdb_client->flag.protected_rdb_channel = 1; uint64_t id = htonu64(replica_rdb_client->id); @@ -250,8 +250,8 @@ void backfillRdbReplicasToPsyncWait(void) { raxSeek(&iter, "^", NULL, 0); while (raxNext(&iter)) { client *replica_rdb_client = iter.data; - if (replica_rdb_client->ref_repl_buf_node) continue; - replica_rdb_client->ref_repl_buf_node = ln; + if (replica_rdb_client->repl_data->ref_repl_buf_node) continue; + replica_rdb_client->repl_data->ref_repl_buf_node = ln; head->refcount++; dualChannelServerLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", (long long unsigned int)replica_rdb_client->id); @@ -263,18 +263,18 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { listNode *ln; replBufBlock *o; /* Get replBufBlock pointed by this replica */ - client *replica_rdb_client = lookupRdbClientByID(replica_main_client->associated_rdb_client_id); - ln = replica_rdb_client->ref_repl_buf_node; + client *replica_rdb_client = lookupRdbClientByID(replica_main_client->repl_data->associated_rdb_client_id); + ln = replica_rdb_client->repl_data->ref_repl_buf_node; o = ln ? listNodeValue(ln) : NULL; if (o != NULL) { serverAssert(o->refcount > 0); o->refcount--; } - replica_rdb_client->ref_repl_buf_node = NULL; + replica_rdb_client->repl_data->ref_repl_buf_node = NULL; replica_rdb_client->flag.protected_rdb_channel = 0; dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", replicationGetReplicaName(replica_main_client), - (long long unsigned int)replica_main_client->associated_rdb_client_id, + (long long unsigned int)replica_main_client->repl_data->associated_rdb_client_id, o ? "ref count decreased" : "doesn't exist"); uint64_t id = htonu64(replica_rdb_client->id); raxRemove(server.replicas_waiting_psync, (unsigned char *)&id, sizeof(id), NULL); @@ -291,7 +291,7 @@ int canFeedReplicaReplBuffer(client *replica) { if (replica->flag.repl_rdbonly) return 0; /* Don't feed replicas that are still waiting for BGSAVE to start. */ - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) return 0; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) return 0; return 1; } @@ -396,15 +396,15 @@ void freeReplicaReferencedReplBuffer(client *replica) { replicationGetReplicaName(replica), (long long unsigned int)replica->id); } } - if (replica->ref_repl_buf_node != NULL) { + if (replica->repl_data->ref_repl_buf_node != NULL) { /* Decrease the start buffer node reference count. 
*/ - replBufBlock *o = listNodeValue(replica->ref_repl_buf_node); + replBufBlock *o = listNodeValue(replica->repl_data->ref_repl_buf_node); serverAssert(o->refcount > 0); o->refcount--; incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); } - replica->ref_repl_buf_node = NULL; - replica->ref_block_pos = 0; + replica->repl_data->ref_repl_buf_node = NULL; + replica->repl_data->ref_block_pos = 0; } /* Replication: Primary side. @@ -486,9 +486,9 @@ void feedReplicationBuffer(char *s, size_t len) { client *replica = ln->value; if (!canFeedReplicaReplBuffer(replica) && !(replica->flag.protected_rdb_channel)) continue; /* Update shared replication buffer start position. */ - if (replica->ref_repl_buf_node == NULL) { - replica->ref_repl_buf_node = start_node; - replica->ref_block_pos = start_pos; + if (replica->repl_data->ref_repl_buf_node == NULL) { + replica->repl_data->ref_repl_buf_node = start_node; + replica->repl_data->ref_block_pos = start_pos; /* Only increase the start block reference count. */ ((replBufBlock *)listNodeValue(start_node))->refcount++; } @@ -771,8 +771,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { /* Setting output buffer of the replica. */ replBufBlock *o = listNodeValue(node); o->refcount++; - c->ref_repl_buf_node = node; - c->ref_block_pos = offset - o->repl_offset; + c->repl_data->ref_repl_buf_node = node; + c->repl_data->ref_block_pos = offset - o->repl_offset; return server.repl_backlog->histlen - skip; } @@ -805,8 +805,8 @@ int replicationSetupReplicaForFullResync(client *replica, long long offset) { char buf[128]; int buflen; - replica->psync_initial_offset = offset; - replica->repl_state = REPLICA_STATE_WAIT_BGSAVE_END; + replica->repl_data->psync_initial_offset = offset; + replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_END; /* We are going to accumulate the incremental changes for this * replica as well. Set replicas_eldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -889,19 +889,19 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * 4) Send the backlog data (from the offset to the end) to the replica. */ waitForClientIO(c); c->flag.replica = 1; - if (c->associated_rdb_client_id && lookupRdbClientByID(c->associated_rdb_client_id)) { - c->repl_state = REPLICA_STATE_BG_RDB_LOAD; + if (c->repl_data->associated_rdb_client_id && lookupRdbClientByID(c->repl_data->associated_rdb_client_id)) { + c->repl_data->repl_state = REPLICA_STATE_BG_RDB_LOAD; removeReplicaFromPsyncWait(c); } else { - c->repl_state = REPLICA_STATE_ONLINE; + c->repl_data->repl_state = REPLICA_STATE_ONLINE; } - c->repl_ack_time = server.unixtime; - c->repl_start_cmd_stream_on_ack = 0; + c->repl_data->repl_ack_time = server.unixtime; + c->repl_data->repl_start_cmd_stream_on_ack = 0; listAddNodeTail(server.replicas, c); /* We can't use the connection buffers since they are used to accumulate * new commands at this stage. But we are sure the socket send buffer is * empty so this write will never fail actually. 
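 * As a sketch, the reply written just below is one of:
 *
 *   +CONTINUE <replid>\r\n   (REPLICA_CAPA_PSYNC2: the replica can follow a
 *                             replication-ID switch)
 *   +CONTINUE\r\n            (legacy replicas)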
*/ - if (c->replica_capa & REPLICA_CAPA_PSYNC2) { + if (c->repl_data->replica_capa & REPLICA_CAPA_PSYNC2) { buflen = snprintf(buf, sizeof(buf), "+CONTINUE %s\r\n", server.replid); } else { buflen = snprintf(buf, sizeof(buf), "+CONTINUE\r\n"); @@ -1003,8 +1003,8 @@ int startBgsaveForReplication(int mincapa, int req) { while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { - replica->repl_state = REPL_STATE_NONE; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + replica->repl_data->repl_state = REPL_STATE_NONE; replica->flag.replica = 0; listDelNode(server.replicas, ln); addReplyError(replica, "BGSAVE failed, replication can't continue"); @@ -1021,9 +1021,9 @@ int startBgsaveForReplication(int mincapa, int req) { while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { /* Check replica has the exact requirements */ - if (replica->replica_req != req) continue; + if (replica->repl_data->replica_req != req) continue; replicationSetupReplicaForFullResync(replica, getPsyncInitialOffset()); } } @@ -1037,6 +1037,8 @@ void syncCommand(client *c) { /* ignore SYNC if already replica or in monitor mode */ if (c->flag.replica) return; + initClientReplicationData(c); + /* Wait for any IO pending operation to finish before changing the client state to replica */ waitForClientIO(c); @@ -1089,7 +1091,7 @@ void syncCommand(client *c) { /* Fail sync if replica doesn't support EOF capability but wants a filtered RDB. This is because we force filtered * RDB's to be generated over a socket and not through a file to avoid conflicts with the snapshot files. Forcing * use of a socket is handled, if needed, in `startBgsaveForReplication`. */ - if (c->replica_req & REPLICA_REQ_RDB_MASK && !(c->replica_capa & REPLICA_CAPA_EOF)) { + if (c->repl_data->replica_req & REPLICA_REQ_RDB_MASK && !(c->repl_data->replica_capa & REPLICA_CAPA_EOF)) { addReplyError(c, "Filtered replica requires EOF capability"); return; } @@ -1124,7 +1126,7 @@ void syncCommand(client *c) { * resync on purpose when they are not able to partially * resync. */ if (primary_replid[0] != '?') server.stat_sync_partial_err++; - if (c->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { + if (c->repl_data->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { dualChannelServerLog(LL_NOTICE, "Replica %s is capable of dual channel synchronization, and partial sync " "isn't possible. " @@ -1149,9 +1151,9 @@ void syncCommand(client *c) { /* Setup the replica as one waiting for BGSAVE to start. The following code * paths will change the state if we handle the replica differently. */ - c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + c->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; if (server.repl_disable_tcp_nodelay) connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ - c->repldbfd = -1; + c->repl_data->repldbfd = -1; c->flag.replica = 1; listAddNodeTail(server.replicas, c); @@ -1183,20 +1185,20 @@ void syncCommand(client *c) { replica = ln->value; /* If the client needs a buffer of commands, we can't use * a replica without replication buffer. 
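 * (As a sketch, the attach test further below is a capability-superset
 * check plus an exact requirements match:
 *
 *   (c_capa & r_capa) == r_capa   // candidate speaks everything r needed
 *   c_req == r_req                // and wants the same RDB contents
 *
 * where c_capa/c_req belong to the new replica and r_capa/r_req to the one
 * that triggered the BGSAVE.)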
*/ - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && (!(replica->flag.repl_rdbonly) || (c->flag.repl_rdbonly))) break; } /* To attach this replica, we check that it has at least all the * capabilities of the replica that triggered the current BGSAVE * and its exact requirements. */ - if (ln && ((c->replica_capa & replica->replica_capa) == replica->replica_capa) && - c->replica_req == replica->replica_req) { + if (ln && ((c->repl_data->replica_capa & replica->repl_data->replica_capa) == replica->repl_data->replica_capa) && + c->repl_data->replica_req == replica->repl_data->replica_req) { /* Perfect, the server is already registering differences for * another replica. Set the right state, and copy the buffer. * We don't copy buffer if clients don't want. */ if (!c->flag.repl_rdbonly) copyReplicaOutputBuffer(c, replica); - replicationSetupReplicaForFullResync(c, replica->psync_initial_offset); + replicationSetupReplicaForFullResync(c, replica->repl_data->psync_initial_offset); serverLog(LL_NOTICE, "Waiting for end of BGSAVE for SYNC"); } else { /* No way, we need to wait for the next BGSAVE in order to @@ -1213,7 +1215,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is in progress. */ } else { - if (server.repl_diskless_sync && (c->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { + if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a * few seconds to wait for more replicas to arrive. */ @@ -1222,7 +1224,7 @@ void syncCommand(client *c) { /* We don't have a BGSAVE in progress, let's start one. Diskless * or disk-based mode is determined by replica's capacity. */ if (!hasActiveChildProcess()) { - startBgsaveForReplication(c->replica_capa, c->replica_req); + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req); } else { serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " "BGSAVE for replication delayed"); @@ -1232,6 +1234,72 @@ void syncCommand(client *c) { return; } +/* Check if there is any other replica waiting dumping RDB finished expect me. + * This function is useful to judge current dumping RDB can be used for full + * synchronization or not. */ +int anyOtherReplicaWaitRdb(client *except_me) { + listIter li; + listNode *ln; + + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica != except_me && replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { + return 1; + } + } + return 0; +} + +void initClientReplicationData(client *c) { + if (c->repl_data) return; + c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); +} + +void freeClientReplicationData(client *c) { + if (!c->repl_data) return; + freeReplicaReferencedReplBuffer(c); + /* Primary/replica cleanup Case 1: + * we lost the connection with a replica. */ + if (c->flag.replica) { + /* If there is no any other replica waiting dumping RDB finished, the + * current child process need not continue to dump RDB, then we kill it. + * So child process won't use more memory, and we also can fork a new + * child process asap to dump rdb for next full synchronization or bgsave. 
+ * But we must also check whether the user has enabled RDB 'save' points; if so, we + * should not kill the child outright, since that RDB is important for keeping the + * user's data safe, and we may delay the configured 'save' for the full sync. */ + if (server.saveparamslen == 0 && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && + server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && + anyOtherReplicaWaitRdb(c) == 0) { + serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); + killRDBChild(); + } + if (c->repl_data->repl_state == REPLICA_STATE_SEND_BULK) { + if (c->repl_data->repldbfd != -1) close(c->repl_data->repldbfd); + if (c->repl_data->replpreamble) sdsfree(c->repl_data->replpreamble); + } + list *l = (c->flag.monitor) ? server.monitors : server.replicas; + listNode *ln = listSearchKey(l, c); + serverAssert(ln != NULL); + listDelNode(l, ln); + /* We need to remember the time when we started to have zero + * attached replicas, as after some time we'll free the replication + * backlog. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA && listLength(server.replicas) == 0) + server.repl_no_replicas_since = server.unixtime; + refreshGoodReplicasCount(); + /* Fire the replica change modules event. */ + if (c->repl_data->repl_state == REPLICA_STATE_ONLINE) + moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, + NULL); + } + if (c->flag.primary) replicationHandlePrimaryDisconnection(); + sdsfree(c->repl_data->replica_addr); + zfree(c->repl_data); + c->repl_data = NULL; +} + /* REPLCONF